diff options
Diffstat (limited to 'net/ipv6')
87 files changed, 7488 insertions, 5062 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 658bfed1df8b..b8f9a8c0302e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -152,7 +152,7 @@ config INET6_TUNNEL default n config IPV6_VTI -tristate "Virtual (secure) IPv6: tunneling" + tristate "Virtual (secure) IPv6: tunneling" select IPV6_TUNNEL select NET_IP_TUNNEL select XFRM @@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL config IPV6_SEG6_HMAC bool "IPv6: Segment Routing HMAC support" depends on IPV6 - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS help Support for HMAC signature generation and verification of SR-enabled packets. @@ -323,6 +322,7 @@ config IPV6_RPL_LWTUNNEL bool "IPv6: RPL Source Routing Header support" depends on IPV6 select LWTUNNEL + select DST_CACHE help Support for RFC6554 RPL Source Routing Header using the lightweight tunnels mechanism. diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 3036a45e8a1e..d283c59df4c1 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -52,4 +52,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o obj-y += mcast_snoop.o +obj-$(CONFIG_TCP_AO) += tcp_ao.o endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f7a84a4acffc..b66217d1b2f8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -63,6 +63,7 @@ #include <linux/string.h> #include <linux/hash.h> +#include <net/ip_tunnels.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> @@ -79,20 +80,19 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/l3mdev.h> +#include <net/netdev_lock.h> #include <linux/if_tunnel.h> #include <linux/rtnetlink.h> #include <linux/netconf.h> #include <linux/random.h> #include <linux/uaccess.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #include <linux/ioam6.h> -#define INFINITY_LIFE_TIME 0xFFFFFFFF - #define IPV6_MAX_STRLEN \ sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") @@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_min_advance = REGEN_MIN_ADVANCE, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, @@ -202,6 +203,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -235,6 +237,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ioam6_id = IOAM6_DEFAULT_IF_ID, .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, .ndisc_evict_nocarrier = 1, + .ra_honor_pio_life = 0, + .ra_honor_pio_pflag = 0, + .force_forwarding = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -255,6 +260,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_min_advance = REGEN_MIN_ADVANCE, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, @@ -262,6 +268,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -295,6 +302,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ioam6_id = IOAM6_DEFAULT_IF_ID, .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, .ndisc_evict_nocarrier = 1, + .ra_honor_pio_life = 0, + .ra_honor_pio_pflag = 0, + .force_forwarding = 0, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -305,7 +315,7 @@ static inline bool addrconf_link_ready(const struct net_device *dev) static void addrconf_del_rs_timer(struct inet6_dev *idev) { - if (del_timer(&idev->rs_timer)) + if (timer_delete(&idev->rs_timer)) __in6_dev_put(idev); } @@ -318,9 +328,8 @@ static void addrconf_del_dad_work(struct inet6_ifaddr *ifp) static void addrconf_mod_rs_timer(struct inet6_dev *idev, unsigned long when) { - if (!timer_pending(&idev->rs_timer)) + if (!mod_timer(&idev->rs_timer, jiffies + when)) in6_dev_hold(idev); - mod_timer(&idev->rs_timer, jiffies + when); } static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp, @@ -371,6 +380,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) int err = -ENOMEM; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev) return ERR_PTR(-EINVAL); @@ -396,7 +406,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) return ERR_PTR(err); } if (ndev->cnf.forwarding) - dev_disable_lro(dev); + netif_disable_lro(dev); /* We refer to the device */ netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL); @@ -546,7 +556,8 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, goto out; if ((all || type == NETCONFA_FORWARDING) && - nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0) + nla_put_s32(skb, NETCONFA_FORWARDING, + READ_ONCE(devconf->forwarding)) < 0) goto nla_put_failure; #ifdef CONFIG_IPV6_MROUTE if ((all || type == NETCONFA_MC_FORWARDING) && @@ -555,12 +566,13 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, goto nla_put_failure; #endif if ((all || type == NETCONFA_PROXY_NEIGH) && - nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, + READ_ONCE(devconf->proxy_ndp)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, - devconf->ignore_routes_with_linkdown) < 0) + READ_ONCE(devconf->ignore_routes_with_linkdown)) < 0) goto nla_put_failure; out: @@ -705,16 +717,33 @@ errout: return err; } +/* Combine dev_addr_genid and dev_base_seq to detect changes. + */ +static u32 inet6_base_seq(const struct net *net) +{ + u32 res = atomic_read(&net->ipv6.dev_addr_genid) + + READ_ONCE(net->dev_base_seq); + + /* Must not return 0 (see nl_dump_check_consistent()). + * Chose a value far away from 0. + */ + if (!res) + res = 0x80000000; + return res; +} + static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - int h, s_h; - int idx, s_idx; + struct { + unsigned long ifindex; + unsigned int all_default; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct inet6_dev *idev; - struct hlist_head *head; + int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; @@ -731,65 +760,46 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, } } - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - rcu_read_lock(); - cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ - net->dev_base_seq; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - idev = __in6_dev_get(dev); - if (!idev) - goto cont; - - if (inet6_netconf_fill_devconf(skb, dev->ifindex, - &idev->cnf, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, - NLM_F_MULTI, - NETCONFA_ALL) < 0) { - rcu_read_unlock(); - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - rcu_read_unlock(); + rcu_read_lock(); + for_each_netdev_dump(net, dev, ctx->ifindex) { + idev = __in6_dev_get(dev); + if (!idev) + continue; + err = inet6_netconf_fill_devconf(skb, dev->ifindex, + &idev->cnf, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) + goto done; } - if (h == NETDEV_HASHENTRIES) { - if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, - net->ipv6.devconf_all, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, NLM_F_MULTI, - NETCONFA_ALL) < 0) + if (ctx->all_default == 0) { + err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) goto done; - else - h++; - } - if (h == NETDEV_HASHENTRIES + 1) { - if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, - net->ipv6.devconf_dflt, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, NLM_F_MULTI, - NETCONFA_ALL) < 0) + ctx->all_default++; + } + if (ctx->all_default == 1) { + err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) goto done; - else - h++; + ctx->all_default++; } done: - cb->args[0] = h; - cb->args[1] = idx; - - return skb->len; + rcu_read_unlock(); + return err; } #ifdef CONFIG_SYSCTL @@ -846,27 +856,30 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); - idev->cnf.forwarding = newf; + /* Disabling all.forwarding sets 0 to force_forwarding for all interfaces */ + if (newf == 0) + WRITE_ONCE(idev->cnf.force_forwarding, 0); + + WRITE_ONCE(idev->cnf.forwarding, newf); if (changed) dev_forward_change(idev); } } } -static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) +static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; - *p = newf; + WRITE_ONCE(*p, newf); if (p == &net->ipv6.devconf_dflt->forwarding) { if ((!newf) ^ (!old)) @@ -874,14 +887,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } if (p == &net->ipv6.devconf_all->forwarding) { int old_dflt = net->ipv6.devconf_dflt->forwarding; - net->ipv6.devconf_dflt->forwarding = newf; + WRITE_ONCE(net->ipv6.devconf_dflt->forwarding, newf); if ((!newf) ^ (!old_dflt)) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, @@ -896,7 +909,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) net->ipv6.devconf_all); } else if ((!newf) ^ (!old)) dev_forward_change((struct inet6_dev *)table->extra1); - rtnl_unlock(); + rtnl_net_unlock(net); if (newf) rt6_purge_dflt_routers(net); @@ -909,11 +922,11 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); - idev->cnf.ignore_routes_with_linkdown = newf; + WRITE_ONCE(idev->cnf.ignore_routes_with_linkdown, newf); if (changed) inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, @@ -924,17 +937,16 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) } } -static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; - *p = newf; + WRITE_ONCE(*p, newf); if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { if ((!newf) ^ (!old)) @@ -943,12 +955,12 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) { - net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf; + WRITE_ONCE(net->ipv6.devconf_dflt->ignore_routes_with_linkdown, newf); addrconf_linkdown_change(net, newf); if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, @@ -957,7 +969,8 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); } - rtnl_unlock(); + + rtnl_net_unlock(net); return 1; } @@ -1009,7 +1022,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr) { - u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } @@ -1034,7 +1047,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) unsigned int hash = inet6_addr_hash(net, &ifa->addr); int err = 0; - spin_lock(&net->ipv6.addrconf_hash_lock); + spin_lock_bh(&net->ipv6.addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) { @@ -1044,7 +1057,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]); } - spin_unlock(&net->ipv6.addrconf_hash_lock); + spin_unlock_bh(&net->ipv6.addrconf_hash_lock); return err; } @@ -1062,20 +1075,28 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, struct fib6_info *f6i = NULL; int err = 0; - if (addr_type == IPV6_ADDR_ANY || - (addr_type & IPV6_ADDR_MULTICAST && - !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) || - (!(idev->dev->flags & IFF_LOOPBACK) && - !netif_is_l3_master(idev->dev) && - addr_type & IPV6_ADDR_LOOPBACK)) + if (addr_type == IPV6_ADDR_ANY) { + NL_SET_ERR_MSG_MOD(extack, "Invalid address"); return ERR_PTR(-EADDRNOTAVAIL); + } else if (addr_type & IPV6_ADDR_MULTICAST && + !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag"); + return ERR_PTR(-EADDRNOTAVAIL); + } else if (!(idev->dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(idev->dev) && + addr_type & IPV6_ADDR_LOOPBACK) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device"); + return ERR_PTR(-EADDRNOTAVAIL); + } if (idev->dead) { - err = -ENODEV; /*XXX*/ + NL_SET_ERR_MSG_MOD(extack, "device is going away"); + err = -ENODEV; goto out; } if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); err = -EACCES; goto out; } @@ -1102,7 +1123,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags); + f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); f6i = NULL; @@ -1139,15 +1160,15 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, /* For caller */ refcount_set(&ifa->refcnt, 1); - rcu_read_lock_bh(); + rcu_read_lock(); err = ipv6_add_addr_hash(idev->dev, ifa); if (err < 0) { - rcu_read_unlock_bh(); + rcu_read_unlock(); goto out; } - write_lock(&idev->lock); + write_lock_bh(&idev->lock); /* Add to inet6_dev unicast addr list. */ ipv6_link_dev_addr(idev, ifa); @@ -1158,9 +1179,9 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, } in6_ifa_hold(ifa); - write_unlock(&idev->lock); + write_unlock_bh(&idev->lock); - rcu_read_unlock_bh(); + rcu_read_unlock(); inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: @@ -1244,6 +1265,7 @@ static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt, bool del_peer) { + struct fib6_table *table; struct fib6_info *f6i; f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr, @@ -1253,8 +1275,15 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, if (del_rt) ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { - if (!(f6i->fib6_flags & RTF_EXPIRES)) + if (!(f6i->fib6_flags & RTF_EXPIRES)) { + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + + spin_unlock_bh(&table->tb6_lock); + } fib6_info_release(f6i); } } @@ -1295,7 +1324,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) __in6_ifa_put(ifp); } - if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) + if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) action = check_cleanup_prefix_route(ifp, &expires); list_del_rcu(&ifp->if_list); @@ -1320,12 +1349,21 @@ out: in6_ifa_put(ifp); } +static unsigned long ipv6_get_regen_advance(const struct inet6_dev *idev) +{ + return READ_ONCE(idev->cnf.regen_min_advance) + + READ_ONCE(idev->cnf.regen_max_retry) * + READ_ONCE(idev->cnf.dad_transmits) * + max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; +} + static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) { struct inet6_dev *idev = ifp->idev; unsigned long tmp_tstamp, age; unsigned long regen_advance; unsigned long now = jiffies; + u32 if_public_preferred_lft; s32 cnf_temp_preferred_lft; struct inet6_ifaddr *ift; struct ifa6_config cfg; @@ -1337,7 +1375,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) retry: in6_dev_hold(idev); - if (idev->cnf.use_tempaddr <= 0) { + if (READ_ONCE(idev->cnf.use_tempaddr) <= 0) { write_unlock_bh(&idev->lock); pr_info("%s: use_tempaddr is disabled\n", __func__); in6_dev_put(idev); @@ -1345,8 +1383,8 @@ retry: goto out; } spin_lock_bh(&ifp->lock); - if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { - idev->cnf.use_tempaddr = -1; /*XXX*/ + if (ifp->regen_count++ >= READ_ONCE(idev->cnf.regen_max_retry)) { + WRITE_ONCE(idev->cnf.use_tempaddr, -1); /*XXX*/ spin_unlock_bh(&ifp->lock); write_unlock_bh(&idev->lock); pr_warn("%s: regeneration time exceeded - disabled temporary address support\n", @@ -1361,16 +1399,14 @@ retry: age = (now - ifp->tstamp) / HZ; - regen_advance = idev->cnf.regen_max_retry * - idev->cnf.dad_transmits * - max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; + regen_advance = ipv6_get_regen_advance(idev); /* recalculate max_desync_factor each time and update * idev->desync_factor if it's larger */ cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft); - max_desync_factor = min_t(__u32, - idev->cnf.max_desync_factor, + max_desync_factor = min_t(long, + READ_ONCE(idev->cnf.max_desync_factor), cnf_temp_preferred_lft - regen_advance); if (unlikely(idev->desync_factor > max_desync_factor)) { @@ -1383,11 +1419,14 @@ retry: } } + if_public_preferred_lft = ifp->prefered_lft; + memset(&cfg, 0, sizeof(cfg)); cfg.valid_lft = min_t(__u32, ifp->valid_lft, - idev->cnf.temp_valid_lft + age); + READ_ONCE(idev->cnf.temp_valid_lft) + age); cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor; - cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft); + cfg.preferred_lft = min_t(__u32, if_public_preferred_lft, cfg.preferred_lft); + cfg.preferred_lft = min_t(__u32, cfg.valid_lft, cfg.preferred_lft); cfg.plen = ifp->prefix_len; tmp_tstamp = ifp->tstamp; @@ -1395,19 +1434,41 @@ retry: write_unlock_bh(&idev->lock); - /* A temporary address is created only if this calculated Preferred - * Lifetime is greater than REGEN_ADVANCE time units. In particular, - * an implementation must not create a temporary address with a zero - * Preferred Lifetime. + /* From RFC 4941: + * + * A temporary address is created only if this calculated Preferred + * Lifetime is greater than REGEN_ADVANCE time units. In + * particular, an implementation must not create a temporary address + * with a zero Preferred Lifetime. + * + * ... + * + * When creating a temporary address, the lifetime values MUST be + * derived from the corresponding prefix as follows: + * + * ... + * + * * Its Preferred Lifetime is the lower of the Preferred Lifetime + * of the public address or TEMP_PREFERRED_LIFETIME - + * DESYNC_FACTOR. + * + * To comply with the RFC's requirements, clamp the preferred lifetime + * to a minimum of regen_advance, unless that would exceed valid_lft or + * ifp->prefered_lft. + * * Use age calculation as in addrconf_verify to avoid unnecessary * temporary addresses being generated. */ age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (cfg.preferred_lft <= regen_advance + age) { - in6_ifa_put(ifp); - in6_dev_put(idev); - ret = -1; - goto out; + cfg.preferred_lft = regen_advance + age + 1; + if (cfg.preferred_lft > cfg.valid_lft || + cfg.preferred_lft > if_public_preferred_lft) { + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } } cfg.ifa_flags = IFA_F_TEMPORARY; @@ -1486,15 +1547,17 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static bool ipv6_use_optimistic_addr(struct net *net, - struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(const struct net *net, + const struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (!idev) return false; - if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) && + !READ_ONCE(idev->cnf.optimistic_dad)) return false; - if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + if (!READ_ONCE(net->ipv6.devconf_all->use_optimistic) && + !READ_ONCE(idev->cnf.use_optimistic)) return false; return true; @@ -1503,13 +1566,14 @@ static bool ipv6_use_optimistic_addr(struct net *net, #endif } -static bool ipv6_allow_optimistic_dad(struct net *net, - struct inet6_dev *idev) +static bool ipv6_allow_optimistic_dad(const struct net *net, + const struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (!idev) return false; - if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) && + !READ_ONCE(idev->cnf.optimistic_dad)) return false; return true; @@ -1615,7 +1679,7 @@ static int ipv6_get_saddr_eval(struct net *net, */ int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ? !!(dst->prefs & IPV6_PREFER_SRC_TMP) : - score->ifa->idev->cnf.use_tempaddr >= 2; + READ_ONCE(score->ifa->idev->cnf.use_tempaddr) >= 2; ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp; break; } @@ -1791,7 +1855,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, idev = __in6_dev_get(dst_dev); if ((dst_type & IPV6_ADDR_MULTICAST) || dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL || - (idev && idev->cnf.use_oif_addrs_only)) { + (idev && READ_ONCE(idev->cnf.use_oif_addrs_only))) { use_oif_addr = true; } } @@ -1815,7 +1879,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, master, &dst, scores, hiscore_idx); - if (scores[hiscore_idx].ifa) + if (scores[hiscore_idx].ifa && + scores[hiscore_idx].scopedist >= 0) goto out; } @@ -2034,9 +2099,10 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { - result = ifp; - in6_ifa_hold(ifp); - break; + if (in6_ifa_hold_safe(ifp)) { + result = ifp; + break; + } } } } @@ -2098,6 +2164,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net *net = dev_net(idev->dev); + int max_addresses; if (addrconf_dad_end(ifp)) { in6_ifa_put(ifp); @@ -2135,9 +2202,9 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) spin_unlock_bh(&ifp->lock); - if (idev->cnf.max_addresses && - ipv6_count_addresses(idev) >= - idev->cnf.max_addresses) + max_addresses = READ_ONCE(idev->cnf.max_addresses); + if (max_addresses && + ipv6_count_addresses(idev) >= max_addresses) goto lock_errdad; net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n", @@ -2167,32 +2234,29 @@ errdad: in6_ifa_put(ifp); } -/* Join to solicited addr multicast group. - * caller must hold RTNL */ +/* Join to solicited addr multicast group. */ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; - if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + if (READ_ONCE(dev->flags) & (IFF_LOOPBACK | IFF_NOARP)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); } -/* caller must hold RTNL */ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; - if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + if (READ_ONCE(idev->dev->flags) & (IFF_LOOPBACK | IFF_NOARP)) return; addrconf_addr_solict_mult(addr, &maddr); __ipv6_dev_mc_dec(idev, &maddr); } -/* caller must hold RTNL */ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; @@ -2205,7 +2269,6 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) __ipv6_dev_ac_inc(ifp->idev, &addr); } -/* caller must hold RTNL */ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; @@ -2509,6 +2572,24 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) return idev; } +static void delete_tempaddrs(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ift, *tmp; + + write_lock_bh(&idev->lock); + list_for_each_entry_safe(ift, tmp, &idev->tempaddr_list, tmp_list) { + if (ift->ifpub != ifp) + continue; + + in6_ifa_hold(ift); + write_unlock_bh(&idev->lock); + ipv6_del_addr(ift); + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + static void manage_tempaddrs(struct inet6_dev *idev, struct inet6_ifaddr *ifp, __u32 valid_lft, __u32 prefered_lft, @@ -2534,11 +2615,11 @@ static void manage_tempaddrs(struct inet6_dev *idev, * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively. */ age = (now - ift->cstamp) / HZ; - max_valid = idev->cnf.temp_valid_lft - age; + max_valid = READ_ONCE(idev->cnf.temp_valid_lft) - age; if (max_valid < 0) max_valid = 0; - max_prefered = idev->cnf.temp_prefered_lft - + max_prefered = READ_ONCE(idev->cnf.temp_prefered_lft) - idev->desync_factor - age; if (max_prefered < 0) max_prefered = 0; @@ -2562,12 +2643,18 @@ static void manage_tempaddrs(struct inet6_dev *idev, ipv6_ifa_notify(0, ift); } - if ((create || list_empty(&idev->tempaddr_list)) && - idev->cnf.use_tempaddr > 0) { + /* Also create a temporary address if it's enabled but no temporary + * address currently exists. + * However, we get called with valid_lft == 0, prefered_lft == 0, create == false + * as part of cleanup (ie. deleting the mngtmpaddr). + * We don't want that to result in creating a new temporary ip address. + */ + if (list_empty(&idev->tempaddr_list) && (valid_lft || prefered_lft)) + create = true; + + if (create && READ_ONCE(idev->cnf.use_tempaddr) > 0) { /* When a new public address is created as described * in [ADDRCONF], also create a new temporary address. - * Also create a temporary address if it's enabled but - * no temporary address currently exists. */ read_unlock_bh(&idev->lock); ipv6_create_tempaddr(ifp, false); @@ -2593,7 +2680,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int create = 0, update_lft = 0; if (!ifp && valid_lft) { - int max_addresses = in6_dev->cnf.max_addresses; + int max_addresses = READ_ONCE(in6_dev->cnf.max_addresses); struct ifa6_config cfg = { .pfx = addr, .plen = pinfo->prefix_len, @@ -2605,8 +2692,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, }; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if ((net->ipv6.devconf_all->optimistic_dad || - in6_dev->cnf.optimistic_dad) && + if ((READ_ONCE(net->ipv6.devconf_all->optimistic_dad) || + READ_ONCE(in6_dev->cnf.optimistic_dad)) && !net->ipv6.devconf_all->forwarding && sllao) cfg.ifa_flags |= IFA_F_OPTIMISTIC; #endif @@ -2642,28 +2729,29 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!create && stored_lft) { + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = !create && stored_lft; + + if (update_lft && !READ_ONCE(in6_dev->cnf.ra_honor_pio_life)) { const u32 minimum_lft = min_t(u32, stored_lft, MIN_VALID_LIFETIME); valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; } if (update_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; - ifp->tstamp = now; + WRITE_ONCE(ifp->tstamp, now); flags = ifp->flags; ifp->flags &= ~IFA_F_DEPRECATED; spin_unlock_bh(&ifp->lock); @@ -2687,12 +2775,14 @@ EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; + struct fib6_table *table; __u32 valid_lft; __u32 prefered_lft; int addr_type, err; u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); + bool ignore_autoconf = false; pinfo = (struct prefix_info *) opt; @@ -2726,6 +2816,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) return; } + if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft) + goto put; + /* * Two things going on here: * 1) Add routes for on-link prefixes @@ -2760,11 +2853,20 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (valid_lft == 0) { ip6_del_rt(net, rt, false); rt = NULL; - } else if (addrconf_finite_timeout(rt_expires)) { - /* not infinity */ - fib6_set_expires(rt, jiffies + rt_expires); } else { - fib6_clean_expires(rt); + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + fib6_set_expires(rt, jiffies + rt_expires); + fib6_add_gc_list(rt); + } else { + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); } } else if (valid_lft) { clock_t expires = 0; @@ -2783,7 +2885,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) /* Try to figure out our local address for this prefix */ - if (pinfo->autoconf && in6_dev->cnf.autoconf) { + ignore_autoconf = READ_ONCE(in6_dev->cnf.ra_honor_pio_pflag) && pinfo->preferpd; + if (pinfo->autoconf && in6_dev->cnf.autoconf && !ignore_autoconf) { struct in6_addr addr; bool tokenized = false, dev_addr_generated = false; @@ -2839,7 +2942,7 @@ put: static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, struct in6_ifreq *ireq) { - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) @@ -2879,11 +2982,11 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); if (dev && dev->type == ARPHRD_SIT) err = addrconf_set_sit_dstaddr(net, dev, &ireq); - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -2907,65 +3010,43 @@ static int ipv6_mc_config(struct sock *sk, bool join, /* * Manual configuration of address on an interface */ -static int inet6_addr_add(struct net *net, int ifindex, - struct ifa6_config *cfg, +static int inet6_addr_add(struct net *net, struct net_device *dev, + struct ifa6_config *cfg, clock_t expires, u32 flags, struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; - struct net_device *dev; - unsigned long timeout; - clock_t expires; - u32 flags; - - ASSERT_RTNL(); - if (cfg->plen > 128) - return -EINVAL; + ASSERT_RTNL_NET(net); - /* check the lifetime */ - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) + if (cfg->plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } - if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) + if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) { + NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64"); return -EINVAL; - - dev = __dev_get_by_index(net, ifindex); - if (!dev) - return -ENODEV; + } idev = addrconf_add_dev(dev); - if (IS_ERR(idev)) + if (IS_ERR(idev)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return PTR_ERR(idev); + } if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, - true, cfg->pfx, ifindex); + true, cfg->pfx, dev->ifindex); - if (ret < 0) + if (ret < 0) { + NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed"); return ret; + } } cfg->scope = ipv6_addr_scope(cfg->pfx); - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - ifp = ipv6_add_addr(idev, cfg, true, extack); if (!IS_ERR(ifp)) { if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -2993,29 +3074,36 @@ static int inet6_addr_add(struct net *net, int ifindex, return 0; } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, - cfg->pfx, ifindex); + cfg->pfx, dev->ifindex); } return PTR_ERR(ifp); } static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, - const struct in6_addr *pfx, unsigned int plen) + const struct in6_addr *pfx, unsigned int plen, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; - if (plen > 128) + if (plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } dev = __dev_get_by_index(net, ifindex); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); return -ENODEV; + } - idev = __in6_dev_get(dev); - if (!idev) + idev = __in6_dev_get_rtnl_net(dev); + if (!idev) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return -ENXIO; + } read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { @@ -3024,11 +3112,12 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); - if (!(ifp->flags & IFA_F_TEMPORARY) && - (ifa_flags & IFA_F_MANAGETEMPADDR)) - manage_tempaddrs(idev, ifp, 0, 0, false, - jiffies); ipv6_del_addr(ifp); + + if (!(ifp->flags & IFA_F_TEMPORARY) && + (ifp->flags & IFA_F_MANAGETEMPADDR)) + delete_tempaddrs(idev, ifp); + addrconf_verify_rtnl(net); if (ipv6_addr_is_multicast(pfx)) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, @@ -3038,6 +3127,8 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, } } read_unlock_bh(&idev->lock); + + NL_SET_ERR_MSG_MOD(extack, "address not found"); return -EADDRNOTAVAIL; } @@ -3049,6 +3140,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) .preferred_lft = INFINITY_LIFE_TIME, .valid_lft = INFINITY_LIFE_TIME, }; + struct net_device *dev; struct in6_ifreq ireq; int err; @@ -3061,9 +3153,16 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) cfg.pfx = &ireq.ifr6_addr; cfg.plen = ireq.ifr6_prefixlen; - rtnl_lock(); - err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL); - rtnl_unlock(); + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); + if (dev) { + netdev_lock_ops(dev); + err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL); + netdev_unlock_ops(dev); + } else { + err = -ENODEV; + } + rtnl_net_unlock(net); return err; } @@ -3078,10 +3177,10 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, - ireq.ifr6_prefixlen); - rtnl_unlock(); + ireq.ifr6_prefixlen, NULL); + rtnl_net_unlock(net); return err; } @@ -3110,34 +3209,31 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, } } -#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) +#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) static void add_v4_addrs(struct inet6_dev *idev) { struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen, offset = 0; + int scope, plen; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ - if (idev->dev->addr_len == sizeof(struct in6_addr)) - offset = sizeof(struct in6_addr) - 4; - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); - if (idev->dev->flags&IFF_POINTOPOINT) { + if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { + scope = IPV6_ADDR_COMPATv4; + plen = 96; + pflags |= RTF_NONEXTHOP; + } else { if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE) return; addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; plen = 64; - } else { - scope = IPV6_ADDR_COMPATv4; - plen = 96; - pflags |= RTF_NONEXTHOP; } if (addr.s6_addr32[3]) { @@ -3206,8 +3302,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, struct inet6_ifaddr *ifp; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || - idev->cnf.optimistic_dad) && + if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad) || + READ_ONCE(idev->cnf.optimistic_dad)) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) cfg.ifa_flags |= IFA_F_OPTIMISTIC; #endif @@ -3272,7 +3368,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address, retry: spin_lock_bh(&lock); - sha1_init(digest); + sha1_init_raw(digest); memset(&data, 0, sizeof(data)); memset(workspace, 0, sizeof(workspace)); memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); @@ -3368,6 +3464,7 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && (dev->type != ARPHRD_6LOWPAN) && + (dev->type != ARPHRD_IP6GRE) && (dev->type != ARPHRD_TUNNEL) && (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP)) { @@ -3386,7 +3483,8 @@ static void addrconf_dev_config(struct net_device *dev) /* this device type has no EUI support */ if (dev->type == ARPHRD_NONE && idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) - idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; + WRITE_ONCE(idev->cnf.addr_gen_mode, + IN6_ADDR_GEN_MODE_RANDOM); addrconf_addr_gen(idev, false); } @@ -3422,31 +3520,55 @@ static void addrconf_sit_config(struct net_device *dev) } #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; ASSERT_RTNL(); - idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) { - pr_debug("%s: add_dev failed\n", __func__); + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) return; - } - if (dev->type == ARPHRD_ETHER) { + /* Generate the IPv6 link-local address using addrconf_addr_gen(), + * unless we have an IPv4 GRE device not bound to an IP address and + * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this + * case). Such devices fall back to add_v4_addrs() instead. + */ + if (!(*(__be32 *)dev->dev_addr == 0 && + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { addrconf_addr_gen(idev, true); return; } add_v4_addrs(idev); - - if (dev->flags & IFF_POINTOPOINT) - addrconf_add_mroute(dev); } #endif +static void addrconf_init_auto_addrs(struct net_device *dev) +{ + switch (dev->type) { +#if IS_ENABLED(CONFIG_IPV6_SIT) + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; +#endif +#if IS_ENABLED(CONFIG_NET_IPGRE) + case ARPHRD_IPGRE: + addrconf_gre_config(dev); + break; +#endif + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + } +} + static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) @@ -3459,7 +3581,7 @@ static int fixup_permanent_addr(struct net *net, struct fib6_info *f6i, *prev; f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, - GFP_ATOMIC); + GFP_ATOMIC, NULL); if (IS_ERR(f6i)) return PTR_ERR(f6i); @@ -3540,7 +3662,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev) { rt6_mtu_change(dev, dev->mtu); - idev->cnf.mtu6 = dev->mtu; + WRITE_ONCE(idev->cnf.mtu6, dev->mtu); break; } @@ -3609,32 +3731,13 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, idev->if_flags |= IF_READY; } - pr_info("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n", - dev->name); + pr_debug("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n", + dev->name); run_pending = 1; } - switch (dev->type) { -#if IS_ENABLED(CONFIG_IPV6_SIT) - case ARPHRD_SIT: - addrconf_sit_config(dev); - break; -#endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) - case ARPHRD_IP6GRE: - case ARPHRD_IPGRE: - addrconf_gre_config(dev); - break; -#endif - case ARPHRD_LOOPBACK: - init_loopback(dev); - break; - - default: - addrconf_dev_config(dev); - break; - } + addrconf_init_auto_addrs(dev); if (!IS_ERR_OR_NULL(idev)) { if (run_pending) @@ -3651,9 +3754,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev->cnf.mtu6 != dev->mtu && dev->mtu >= IPV6_MIN_MTU) { rt6_mtu_change(dev, dev->mtu); - idev->cnf.mtu6 = dev->mtu; + WRITE_ONCE(idev->cnf.mtu6, dev->mtu); } - idev->tstamp = jiffies; + WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); /* @@ -3758,7 +3861,7 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) * Do not dev_put! */ if (unregister) { - idev->dead = 1; + WRITE_ONCE(idev->dead, 1); /* protected by rtnl_lock */ RCU_INIT_POINTER(dev->ip6_ptr, NULL); @@ -3773,10 +3876,10 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) */ if (!unregister && !idev->cnf.disable_ipv6) { /* aggregate the system setting and interface setting */ - int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; + int _keep_addr = READ_ONCE(net->ipv6.devconf_all->keep_addr_on_down); if (!_keep_addr) - _keep_addr = idev->cnf.keep_addr_on_down; + _keep_addr = READ_ONCE(idev->cnf.keep_addr_on_down); keep_addr = (_keep_addr > 0); } @@ -3895,7 +3998,7 @@ restart: ipv6_mc_down(idev); } - idev->tstamp = jiffies; + WRITE_ONCE(idev->tstamp, jiffies); idev->ra_mtu = 0; /* Last: Shot the device (if unregistered) */ @@ -3910,9 +4013,10 @@ restart: static void addrconf_rs_timer(struct timer_list *t) { - struct inet6_dev *idev = from_timer(idev, t, rs_timer); + struct inet6_dev *idev = timer_container_of(idev, t, rs_timer); struct net_device *dev = idev->dev; struct in6_addr lladdr; + int rtr_solicits; write_lock(&idev->lock); if (idev->dead || !(idev->if_flags & IF_READY)) @@ -3925,7 +4029,9 @@ static void addrconf_rs_timer(struct timer_list *t) if (idev->if_flags & IF_RA_RCVD) goto out; - if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) { + rtr_solicits = READ_ONCE(idev->cnf.rtr_solicits); + + if (idev->rs_probes++ < rtr_solicits || rtr_solicits < 0) { write_unlock(&idev->lock); if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) ndisc_send_rs(dev, &lladdr, @@ -3935,11 +4041,12 @@ static void addrconf_rs_timer(struct timer_list *t) write_lock(&idev->lock); idev->rs_interval = rfc3315_s14_backoff_update( - idev->rs_interval, idev->cnf.rtr_solicit_max_interval); + idev->rs_interval, + READ_ONCE(idev->cnf.rtr_solicit_max_interval)); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == - idev->cnf.rtr_solicits) ? - idev->cnf.rtr_solicit_delay : + READ_ONCE(idev->cnf.rtr_solicits)) ? + READ_ONCE(idev->cnf.rtr_solicit_delay) : idev->rs_interval); } else { /* @@ -3960,24 +4067,25 @@ put: */ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) { - unsigned long rand_num; struct inet6_dev *idev = ifp->idev; + unsigned long rand_num; u64 nonce; if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = get_random_u32_below(idev->cnf.rtr_solicit_delay ? : 1); + rand_num = get_random_u32_below( + READ_ONCE(idev->cnf.rtr_solicit_delay) ? : 1); nonce = 0; - if (idev->cnf.enhanced_dad || - dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) { + if (READ_ONCE(idev->cnf.enhanced_dad) || + READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad)) { do get_random_bytes(&nonce, 6); while (nonce == 0); } ifp->dad_nonce = nonce; - ifp->dad_probes = idev->cnf.dad_transmits; + ifp->dad_probes = READ_ONCE(idev->cnf.dad_transmits); addrconf_mod_dad_work(ifp, rand_num); } @@ -3997,8 +4105,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) net = dev_net(dev); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - (net->ipv6.devconf_all->accept_dad < 1 && - idev->cnf.accept_dad < 1) || + (READ_ONCE(net->ipv6.devconf_all->accept_dad) < 1 && + READ_ONCE(idev->cnf.accept_dad) < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bool send_na = false; @@ -4073,6 +4181,7 @@ static void addrconf_dad_work(struct work_struct *w) struct inet6_dev *idev = ifp->idev; bool bump_id, disable_ipv6 = false; struct in6_addr mcaddr; + struct net *net; enum { DAD_PROCESS, @@ -4080,7 +4189,9 @@ static void addrconf_dad_work(struct work_struct *w) DAD_ABORT, } action = DAD_PROCESS; - rtnl_lock(); + net = dev_net(idev->dev); + + rtnl_net_lock(net); spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { @@ -4090,8 +4201,8 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || - idev->cnf.accept_dad > 1) && + if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 || + READ_ONCE(idev->cnf.accept_dad) > 1) && !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; @@ -4102,7 +4213,7 @@ static void addrconf_dad_work(struct work_struct *w) if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && ipv6_addr_equal(&ifp->addr, &addr)) { /* DAD failed for link-local based on MAC */ - idev->cnf.disable_ipv6 = 1; + WRITE_ONCE(idev->cnf.disable_ipv6, 1); pr_info("%s: IPv6 being disabled!\n", ifp->idev->dev->name); @@ -4172,7 +4283,7 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_nonce); out: in6_ifa_put(ifp); - rtnl_unlock(); + rtnl_net_unlock(net); } /* ifp->idev must be at least read locked */ @@ -4216,9 +4327,10 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && - ifp->idev->cnf.rtr_solicits != 0 && + READ_ONCE(ifp->idev->cnf.rtr_solicits) != 0 && (dev->flags & IFF_LOOPBACK) == 0 && - (dev->type != ARPHRD_TUNNEL); + (dev->type != ARPHRD_TUNNEL) && + !netif_is_team_port(dev); read_unlock_bh(&ifp->idev->lock); /* While dad is in progress mld report's source address is in6_addrany. @@ -4229,8 +4341,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, /* send unsolicited NA if enabled */ if (send_na && - (ifp->idev->cnf.ndisc_notify || - dev_net(dev)->ipv6.devconf_all->ndisc_notify)) { + (READ_ONCE(ifp->idev->cnf.ndisc_notify) || + READ_ONCE(dev_net(dev)->ipv6.devconf_all->ndisc_notify))) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr, /*router=*/ !!ifp->idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, @@ -4250,7 +4362,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, write_lock_bh(&ifp->idev->lock); spin_lock(&ifp->lock); ifp->idev->rs_interval = rfc3315_s14_backoff_init( - ifp->idev->cnf.rtr_solicit_interval); + READ_ONCE(ifp->idev->cnf.rtr_solicit_interval)); ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval); @@ -4530,9 +4642,7 @@ restart: !ifp->regen_count && ifp->ifpub) { /* This is a non-regenerated temporary addr. */ - unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * - ifp->idev->cnf.dad_transmits * - max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; + unsigned long regen_advance = ipv6_get_regen_advance(ifp->idev); if (age + regen_advance >= ifp->prefered_lft) { struct inet6_ifaddr *ifpub = ifp->ifpub; @@ -4621,9 +4731,9 @@ static void addrconf_verify_work(struct work_struct *w) struct net *net = container_of(to_delayed_work(w), struct net, ipv6.addr_chk_work); - rtnl_lock(); + rtnl_net_lock(net); addrconf_verify_rtnl(net); - rtnl_unlock(); + rtnl_net_unlock(net); } static void addrconf_verify(struct net *net) @@ -4681,19 +4791,24 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!pfx) return -EINVAL; - ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); /* We ignore other flags so far. */ ifa_flags &= IFA_F_MANAGETEMPADDR; - return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, - ifm->ifa_prefixlen); + rtnl_net_lock(net); + err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, + ifm->ifa_prefixlen, extack); + rtnl_net_unlock(net); + + return err; } -static int modify_prefix_route(struct inet6_ifaddr *ifp, +static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, unsigned long expires, u32 flags, bool modify_peer) { + struct fib6_table *table; struct fib6_info *f6i; u32 prio; @@ -4713,32 +4828,36 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ifp->prefix_len, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); - } else { - if (!expires) + return 0; + } + if (f6i != net->ipv6.fib6_null_entry) { + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!(flags & RTF_EXPIRES)) { fib6_clean_expires(f6i); - else + fib6_remove_gc_list(f6i); + } else { fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + } - fib6_info_release(f6i); + spin_unlock_bh(&table->tb6_lock); } + fib6_info_release(f6i); return 0; } static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, - struct ifa6_config *cfg) + struct ifa6_config *cfg, clock_t expires, + u32 flags) { - u32 flags; - clock_t expires; - unsigned long timeout; bool was_managetempaddr; - bool had_prefixroute; bool new_peer = false; + bool had_prefixroute; - ASSERT_RTNL(); - - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) - return -EINVAL; + ASSERT_RTNL_NET(net); if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) @@ -4747,24 +4866,6 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) cfg->ifa_flags &= ~IFA_F_OPTIMISTIC; - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - if (cfg->peer_pfx && memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) { if (!ipv6_addr_any(&ifp->peer_addr)) @@ -4780,13 +4881,13 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE); ifp->flags |= cfg->ifa_flags; - ifp->tstamp = jiffies; - ifp->valid_lft = cfg->valid_lft; - ifp->prefered_lft = cfg->preferred_lft; - ifp->ifa_proto = cfg->ifa_proto; + WRITE_ONCE(ifp->tstamp, jiffies); + WRITE_ONCE(ifp->valid_lft, cfg->valid_lft); + WRITE_ONCE(ifp->prefered_lft, cfg->preferred_lft); + WRITE_ONCE(ifp->ifa_proto, cfg->ifa_proto); if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) - ifp->rt_priority = cfg->rt_priority; + WRITE_ONCE(ifp->rt_priority, cfg->rt_priority); if (new_peer) ifp->peer_addr = *cfg->peer_pfx; @@ -4799,7 +4900,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, int rc = -ENOENT; if (had_prefixroute) - rc = modify_prefix_route(ifp, expires, flags, false); + rc = modify_prefix_route(net, ifp, expires, flags, false); /* prefix route could have been deleted; if so restore it */ if (rc == -ENOENT) { @@ -4809,7 +4910,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, } if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr)) - rc = modify_prefix_route(ifp, expires, flags, true); + rc = modify_prefix_route(net, ifp, expires, flags, true); if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) { addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len, @@ -4831,14 +4932,12 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, } if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { - if (was_managetempaddr && - !(ifp->flags & IFA_F_MANAGETEMPADDR)) { - cfg->valid_lft = 0; - cfg->preferred_lft = 0; - } - manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft, - cfg->preferred_lft, !was_managetempaddr, - jiffies); + if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) + delete_tempaddrs(ifp->idev, ifp); + else + manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft, + cfg->preferred_lft, !was_managetempaddr, + jiffies); } addrconf_verify_rtnl(net); @@ -4851,13 +4950,16 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; struct inet6_dev *idev; struct ifa6_config cfg; + struct ifaddrmsg *ifm; + unsigned long timeout; + clock_t expires; + u32 flags; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, @@ -4880,8 +4982,18 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFA_PROTO]) cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]); + cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); + + /* We ignore other flags so far. */ + cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | + IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | + IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + + cfg.ifa_flags |= IFA_F_PERMANENT; cfg.valid_lft = INFINITY_LIFE_TIME; cfg.preferred_lft = INFINITY_LIFE_TIME; + expires = 0; + flags = 0; if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -4889,25 +5001,44 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ci = nla_data(tb[IFA_CACHEINFO]); cfg.valid_lft = ci->ifa_valid; cfg.preferred_lft = ci->ifa_prefered; - } - dev = __dev_get_by_index(net, ifm->ifa_index); - if (!dev) - return -ENODEV; + if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) { + NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); + return -EINVAL; + } - if (tb[IFA_FLAGS]) - cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]); - else - cfg.ifa_flags = ifm->ifa_flags; + timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + cfg.ifa_flags &= ~IFA_F_PERMANENT; + cfg.valid_lft = timeout; + expires = jiffies_to_clock_t(timeout * HZ); + flags = RTF_EXPIRES; + } - /* We ignore other flags so far. */ - cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | - IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | - IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + cfg.ifa_flags |= IFA_F_DEPRECATED; + + cfg.preferred_lft = timeout; + } + } + + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); + err = -ENODEV; + goto unlock_rtnl; + } + + netdev_lock_ops(dev); idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) - return PTR_ERR(idev); + if (IS_ERR(idev)) { + err = PTR_ERR(idev); + goto unlock; + } if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; @@ -4915,7 +5046,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.ifa_flags & IFA_F_NODAD && cfg.ifa_flags & IFA_F_OPTIMISTIC) { NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto unlock; } ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1); @@ -4924,16 +5056,23 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, * It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ - return inet6_addr_add(net, ifm->ifa_index, &cfg, extack); + err = inet6_addr_add(net, dev, &cfg, expires, flags, extack); + goto unlock; } if (nlh->nlmsg_flags & NLM_F_EXCL || - !(nlh->nlmsg_flags & NLM_F_REPLACE)) + !(nlh->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "address already assigned"); err = -EEXIST; - else - err = inet6_addr_modify(net, ifa, &cfg); + } else { + err = inet6_addr_modify(net, ifa, &cfg, expires, flags); + } in6_ifa_put(ifa); +unlock: + netdev_unlock_ops(dev); +unlock_rtnl: + rtnl_net_unlock(net); return err; } @@ -4987,33 +5126,21 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } -enum addr_type_t { - UNICAST_ADDR, - MULTICAST_ADDR, - ANYCAST_ADDR, -}; - -struct inet6_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; - enum addr_type_t type; -}; - -static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, +static int inet6_fill_ifaddr(struct sk_buff *skb, + const struct inet6_ifaddr *ifa, struct inet6_fill_args *args) { - struct nlmsghdr *nlh; + struct nlmsghdr *nlh; u32 preferred, valid; + u32 flags, priority; + u8 proto; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; + flags = READ_ONCE(ifa->flags); put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), ifa->idev->dev->ifindex); @@ -5021,13 +5148,14 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto error; - spin_lock_bh(&ifa->lock); - if (!((ifa->flags&IFA_F_PERMANENT) && - (ifa->prefered_lft == INFINITY_LIFE_TIME))) { - preferred = ifa->prefered_lft; - valid = ifa->valid_lft; + preferred = READ_ONCE(ifa->prefered_lft); + valid = READ_ONCE(ifa->valid_lft); + + if (!((flags & IFA_F_PERMANENT) && + (preferred == INFINITY_LIFE_TIME))) { if (preferred != INFINITY_LIFE_TIME) { - long tval = (jiffies - ifa->tstamp)/HZ; + long tval = (jiffies - READ_ONCE(ifa->tstamp)) / HZ; + if (preferred > tval) preferred -= tval; else @@ -5043,28 +5171,29 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } - spin_unlock_bh(&ifa->lock); if (!ipv6_addr_any(&ifa->peer_addr)) { if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 || nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0) goto error; - } else + } else { if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0) goto error; + } - if (ifa->rt_priority && - nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority)) + priority = READ_ONCE(ifa->rt_priority); + if (priority && nla_put_u32(skb, IFA_RT_PRIORITY, priority)) goto error; - if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) + if (put_cacheinfo(skb, ifa->cstamp, READ_ONCE(ifa->tstamp), + preferred, valid) < 0) goto error; - if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) + if (nla_put_u32(skb, IFA_FLAGS, flags) < 0) goto error; - if (ifa->ifa_proto && - nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) + proto = READ_ONCE(ifa->ifa_proto); + if (proto && nla_put_u8(skb, IFA_PROTO, proto)) goto error; nlmsg_end(skb, nlh); @@ -5075,14 +5204,16 @@ error: return -EMSGSIZE; } -static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, - struct inet6_fill_args *args) +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args) { - struct nlmsghdr *nlh; - u8 scope = RT_SCOPE_UNIVERSE; int ifindex = ifmca->idev->dev->ifindex; + u8 scope = RT_SCOPE_UNIVERSE; + struct nlmsghdr *nlh; - if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) + if (!args->force_rt_scope_universe && + ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, @@ -5098,7 +5229,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || - put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, + put_cacheinfo(skb, ifmca->mca_cstamp, READ_ONCE(ifmca->mca_tstamp), INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; @@ -5108,13 +5239,14 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, return 0; } -static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, - struct inet6_fill_args *args) +int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, + struct inet6_fill_args *args) { struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); int ifindex = dev ? dev->ifindex : 1; - struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; + struct nlmsghdr *nlh; if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; @@ -5132,7 +5264,7 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || - put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, + put_cacheinfo(skb, ifaca->aca_cstamp, READ_ONCE(ifaca->aca_tstamp), INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; @@ -5143,24 +5275,23 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, } /* called with rcu_read_lock() */ -static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, - struct netlink_callback *cb, int s_ip_idx, +static int in6_dump_addrs(const struct inet6_dev *idev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, struct inet6_fill_args *fillargs) { - struct ifmcaddr6 *ifmca; - struct ifacaddr6 *ifaca; + const struct ifmcaddr6 *ifmca; + const struct ifacaddr6 *ifaca; int ip_idx = 0; - int err = 1; + int err = 0; - read_lock_bh(&idev->lock); switch (fillargs->type) { case UNICAST_ADDR: { - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; fillargs->event = RTM_NEWADDR; /* unicast address incl. temp addr */ - list_for_each_entry(ifa, &idev->addr_list, if_list) { - if (ip_idx < s_ip_idx) + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { + if (ip_idx < *s_ip_idx) goto next; err = inet6_fill_ifaddr(skb, ifa, fillargs); if (err < 0) @@ -5172,27 +5303,25 @@ next: break; } case MULTICAST_ADDR: - read_unlock_bh(&idev->lock); fillargs->event = RTM_GETMULTICAST; /* multicast address */ - for (ifmca = rtnl_dereference(idev->mc_list); + for (ifmca = rcu_dereference(idev->mc_list); ifmca; - ifmca = rtnl_dereference(ifmca->next), ip_idx++) { - if (ip_idx < s_ip_idx) + ifmca = rcu_dereference(ifmca->next), ip_idx++) { + if (ip_idx < *s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); if (err < 0) break; } - read_lock_bh(&idev->lock); break; case ANYCAST_ADDR: fillargs->event = RTM_GETANYCAST; /* anycast address */ - for (ifaca = idev->ac_list; ifaca; - ifaca = ifaca->aca_next, ip_idx++) { - if (ip_idx < s_ip_idx) + for (ifaca = rcu_dereference(idev->ac_list); ifaca; + ifaca = rcu_dereference(ifaca->aca_next), ip_idx++) { + if (ip_idx < *s_ip_idx) continue; err = inet6_fill_ifacaddr(skb, ifaca, fillargs); if (err < 0) @@ -5202,8 +5331,7 @@ next: default: break; } - read_unlock_bh(&idev->lock); - cb->args[2] = ip_idx; + *s_ip_idx = err ? ip_idx : 0; return err; } @@ -5217,12 +5345,12 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); return -EINVAL; @@ -5266,6 +5394,7 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type) { + struct net *tgt_net = sock_net(skb->sk); const struct nlmsghdr *nlh = cb->nlh; struct inet6_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, @@ -5273,73 +5402,55 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .flags = NLM_F_MULTI, .netnsid = -1, .type = type, + .force_rt_scope_universe = false, }; - struct net *tgt_net = sock_net(skb->sk); - int idx, s_idx, s_ip_idx; - int h, s_h; + struct { + unsigned long ifindex; + int ip_idx; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct inet6_dev *idev; - struct hlist_head *head; int err = 0; - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - s_ip_idx = cb->args[2]; - + rcu_read_lock(); if (cb->strict_check) { err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) - goto put_tgt_net; + goto done; err = 0; if (fillargs.ifindex) { - dev = __dev_get_by_index(tgt_net, fillargs.ifindex); + dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; - goto put_tgt_net; + goto done; } idev = __in6_dev_get(dev); - if (idev) { - err = in6_dump_addrs(idev, skb, cb, s_ip_idx, + if (idev) + err = in6_dump_addrs(idev, skb, cb, + &ctx->ip_idx, &fillargs); - if (err > 0) - err = 0; - } - goto put_tgt_net; + goto done; } } - rcu_read_lock(); - cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq; - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &tgt_net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - if (h > s_h || idx > s_idx) - s_ip_idx = 0; - idev = __in6_dev_get(dev); - if (!idev) - goto cont; - - if (in6_dump_addrs(idev, skb, cb, s_ip_idx, - &fillargs) < 0) - goto done; -cont: - idx++; - } + cb->seq = inet6_base_seq(tgt_net); + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + idev = __in6_dev_get(dev); + if (!idev) + continue; + err = in6_dump_addrs(idev, skb, cb, &ctx->ip_idx, + &fillargs); + if (err < 0) + goto done; } done: rcu_read_unlock(); - cb->args[0] = h; - cb->args[1] = idx; -put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return skb->len ? : err; + return err; } static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) @@ -5372,7 +5483,8 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, struct ifaddrmsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request"); return -EINVAL; } @@ -5381,7 +5493,6 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); return -EINVAL; @@ -5420,6 +5531,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .event = RTM_NEWADDR, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; @@ -5443,9 +5555,10 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, } addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (!addr) - return -EINVAL; - + if (!addr) { + err = -EINVAL; + goto errout; + } ifm = nlmsg_data(nlh); if (ifm->ifa_index) dev = dev_get_by_index(tgt_net, ifm->ifa_index); @@ -5490,6 +5603,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) .event = event, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; int err = -ENOBUFS; @@ -5507,90 +5621,101 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); } -static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, - __s32 *array, int bytes) +static void ipv6_store_devconf(const struct ipv6_devconf *cnf, + __s32 *array, int bytes) { BUG_ON(bytes < (DEVCONF_MAX * 4)); memset(array, 0, bytes); - array[DEVCONF_FORWARDING] = cnf->forwarding; - array[DEVCONF_HOPLIMIT] = cnf->hop_limit; - array[DEVCONF_MTU6] = cnf->mtu6; - array[DEVCONF_ACCEPT_RA] = cnf->accept_ra; - array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects; - array[DEVCONF_AUTOCONF] = cnf->autoconf; - array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; - array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; + array[DEVCONF_FORWARDING] = READ_ONCE(cnf->forwarding); + array[DEVCONF_HOPLIMIT] = READ_ONCE(cnf->hop_limit); + array[DEVCONF_MTU6] = READ_ONCE(cnf->mtu6); + array[DEVCONF_ACCEPT_RA] = READ_ONCE(cnf->accept_ra); + array[DEVCONF_ACCEPT_REDIRECTS] = READ_ONCE(cnf->accept_redirects); + array[DEVCONF_AUTOCONF] = READ_ONCE(cnf->autoconf); + array[DEVCONF_DAD_TRANSMITS] = READ_ONCE(cnf->dad_transmits); + array[DEVCONF_RTR_SOLICITS] = READ_ONCE(cnf->rtr_solicits); array[DEVCONF_RTR_SOLICIT_INTERVAL] = - jiffies_to_msecs(cnf->rtr_solicit_interval); + jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_interval)); array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] = - jiffies_to_msecs(cnf->rtr_solicit_max_interval); + jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_max_interval)); array[DEVCONF_RTR_SOLICIT_DELAY] = - jiffies_to_msecs(cnf->rtr_solicit_delay); - array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; + jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_delay)); + array[DEVCONF_FORCE_MLD_VERSION] = READ_ONCE(cnf->force_mld_version); array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] = - jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval); + jiffies_to_msecs(READ_ONCE(cnf->mldv1_unsolicited_report_interval)); array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] = - jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval); - array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; - array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; - array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; - array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; - array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; - array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; - array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; - array[DEVCONF_RA_DEFRTR_METRIC] = cnf->ra_defrtr_metric; - array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; - array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; + jiffies_to_msecs(READ_ONCE(cnf->mldv2_unsolicited_report_interval)); + array[DEVCONF_USE_TEMPADDR] = READ_ONCE(cnf->use_tempaddr); + array[DEVCONF_TEMP_VALID_LFT] = READ_ONCE(cnf->temp_valid_lft); + array[DEVCONF_TEMP_PREFERED_LFT] = READ_ONCE(cnf->temp_prefered_lft); + array[DEVCONF_REGEN_MAX_RETRY] = READ_ONCE(cnf->regen_max_retry); + array[DEVCONF_MAX_DESYNC_FACTOR] = READ_ONCE(cnf->max_desync_factor); + array[DEVCONF_MAX_ADDRESSES] = READ_ONCE(cnf->max_addresses); + array[DEVCONF_ACCEPT_RA_DEFRTR] = READ_ONCE(cnf->accept_ra_defrtr); + array[DEVCONF_RA_DEFRTR_METRIC] = READ_ONCE(cnf->ra_defrtr_metric); + array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = + READ_ONCE(cnf->accept_ra_min_hop_limit); + array[DEVCONF_ACCEPT_RA_PINFO] = READ_ONCE(cnf->accept_ra_pinfo); #ifdef CONFIG_IPV6_ROUTER_PREF - array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; + array[DEVCONF_ACCEPT_RA_RTR_PREF] = READ_ONCE(cnf->accept_ra_rtr_pref); array[DEVCONF_RTR_PROBE_INTERVAL] = - jiffies_to_msecs(cnf->rtr_probe_interval); + jiffies_to_msecs(READ_ONCE(cnf->rtr_probe_interval)); #ifdef CONFIG_IPV6_ROUTE_INFO - array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen; - array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; + array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = + READ_ONCE(cnf->accept_ra_rt_info_min_plen); + array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = + READ_ONCE(cnf->accept_ra_rt_info_max_plen); #endif #endif - array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp; - array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; + array[DEVCONF_PROXY_NDP] = READ_ONCE(cnf->proxy_ndp); + array[DEVCONF_ACCEPT_SOURCE_ROUTE] = + READ_ONCE(cnf->accept_source_route); #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; - array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic; + array[DEVCONF_OPTIMISTIC_DAD] = READ_ONCE(cnf->optimistic_dad); + array[DEVCONF_USE_OPTIMISTIC] = READ_ONCE(cnf->use_optimistic); #endif #ifdef CONFIG_IPV6_MROUTE array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding); #endif - array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; - array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; - array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; - array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; - array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; - array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; - array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; - array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; + array[DEVCONF_DISABLE_IPV6] = READ_ONCE(cnf->disable_ipv6); + array[DEVCONF_ACCEPT_DAD] = READ_ONCE(cnf->accept_dad); + array[DEVCONF_FORCE_TLLAO] = READ_ONCE(cnf->force_tllao); + array[DEVCONF_NDISC_NOTIFY] = READ_ONCE(cnf->ndisc_notify); + array[DEVCONF_SUPPRESS_FRAG_NDISC] = + READ_ONCE(cnf->suppress_frag_ndisc); + array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = + READ_ONCE(cnf->accept_ra_from_local); + array[DEVCONF_ACCEPT_RA_MTU] = READ_ONCE(cnf->accept_ra_mtu); + array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = + READ_ONCE(cnf->ignore_routes_with_linkdown); /* we omit DEVCONF_STABLE_SECRET for now */ - array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; - array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; - array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; - array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; - array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled; + array[DEVCONF_USE_OIF_ADDRS_ONLY] = READ_ONCE(cnf->use_oif_addrs_only); + array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = + READ_ONCE(cnf->drop_unicast_in_l2_multicast); + array[DEVCONF_DROP_UNSOLICITED_NA] = READ_ONCE(cnf->drop_unsolicited_na); + array[DEVCONF_KEEP_ADDR_ON_DOWN] = READ_ONCE(cnf->keep_addr_on_down); + array[DEVCONF_SEG6_ENABLED] = READ_ONCE(cnf->seg6_enabled); #ifdef CONFIG_IPV6_SEG6_HMAC - array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; + array[DEVCONF_SEG6_REQUIRE_HMAC] = READ_ONCE(cnf->seg6_require_hmac); #endif - array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; - array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; - array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; - array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; - array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; - array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled; - array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; - array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; - array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; - array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; + array[DEVCONF_ENHANCED_DAD] = READ_ONCE(cnf->enhanced_dad); + array[DEVCONF_ADDR_GEN_MODE] = READ_ONCE(cnf->addr_gen_mode); + array[DEVCONF_DISABLE_POLICY] = READ_ONCE(cnf->disable_policy); + array[DEVCONF_NDISC_TCLASS] = READ_ONCE(cnf->ndisc_tclass); + array[DEVCONF_RPL_SEG_ENABLED] = READ_ONCE(cnf->rpl_seg_enabled); + array[DEVCONF_IOAM6_ENABLED] = READ_ONCE(cnf->ioam6_enabled); + array[DEVCONF_IOAM6_ID] = READ_ONCE(cnf->ioam6_id); + array[DEVCONF_IOAM6_ID_WIDE] = READ_ONCE(cnf->ioam6_id_wide); + array[DEVCONF_NDISC_EVICT_NOCARRIER] = + READ_ONCE(cnf->ndisc_evict_nocarrier); + array[DEVCONF_ACCEPT_UNTRACKED_NA] = + READ_ONCE(cnf->accept_untracked_na); + array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft); + array[DEVCONF_FORCE_FORWARDING] = READ_ONCE(cnf->force_forwarding); } static inline size_t inet6_ifla6_size(void) @@ -5667,16 +5792,38 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, } } +static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb, + struct inet6_dev *idev) +{ + struct nlattr *nla; + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, u32 ext_filter_mask) { - struct nlattr *nla; struct ifla_cacheinfo ci; + struct nlattr *nla; + u32 ra_mtu; - if (nla_put_u32(skb, IFLA_INET6_FLAGS, idev->if_flags)) + if (nla_put_u32(skb, IFLA_INET6_FLAGS, READ_ONCE(idev->if_flags))) goto nla_put_failure; ci.max_reasm_len = IPV6_MAXPLEN; - ci.tstamp = cstamp_delta(idev->tstamp); + ci.tstamp = cstamp_delta(READ_ONCE(idev->tstamp)); ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME)); if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci)) @@ -5688,18 +5835,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, /* XXX - MC not implemented */ - if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS) - return 0; - - nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); - - nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) { + if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0) + goto nla_put_failure; + } nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) @@ -5708,11 +5847,12 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, + READ_ONCE(idev->cnf.addr_gen_mode))) goto nla_put_failure; - if (idev->ra_mtu && - nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu)) + ra_mtu = READ_ONCE(idev->ra_mtu); + if (ra_mtu && nla_put_u32(skb, IFLA_INET6_RA_MTU, ra_mtu)) goto nla_put_failure; return 0; @@ -5774,7 +5914,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token, return -EINVAL; } - if (idev->cnf.rtr_solicits == 0) { + if (READ_ONCE(idev->cnf.rtr_solicits) == 0) { NL_SET_ERR_MSG(extack, "Router solicitation is disabled on device"); return -EINVAL; @@ -5807,7 +5947,7 @@ update_lft: if (update_rs) { idev->if_flags |= IF_RS_SENT; idev->rs_interval = rfc3315_s14_backoff_init( - idev->cnf.rtr_solicit_interval); + READ_ONCE(idev->cnf.rtr_solicit_interval)); idev->rs_probes = 1; addrconf_mod_rs_timer(idev, idev->rs_interval); } @@ -5913,7 +6053,7 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla, if (tb[IFLA_INET6_ADDR_GEN_MODE]) { u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); - idev->cnf.addr_gen_mode = mode; + WRITE_ONCE(idev->cnf.addr_gen_mode, mode); } return 0; @@ -5925,6 +6065,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, struct net_device *dev = idev->dev; struct ifinfomsg *hdr; struct nlmsghdr *nlh; + int ifindex, iflink; void *protoinfo; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); @@ -5935,18 +6076,20 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, hdr->ifi_family = AF_INET6; hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; - hdr->ifi_index = dev->ifindex; - hdr->ifi_flags = dev_get_flags(dev); + ifindex = READ_ONCE(dev->ifindex); + hdr->ifi_index = ifindex; + hdr->ifi_flags = netif_get_flags(dev); hdr->ifi_change = 0; + iflink = dev_get_iflink(dev); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + (ifindex != iflink && + nla_put_u32(skb, IFLA_LINK, iflink)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN)) + netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN)) goto nla_put_failure; protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO); if (!protoinfo) @@ -5969,7 +6112,8 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); return -EINVAL; } @@ -5979,7 +6123,6 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); @@ -5992,50 +6135,39 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int h, s_h; - int idx = 0, s_idx; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct inet6_dev *idev; - struct hlist_head *head; + int err; /* only requests using strict checking can pass data to * influence the dump */ if (cb->strict_check) { - int err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack); + err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack); if (err < 0) return err; } - s_h = cb->args[0]; - s_idx = cb->args[1]; - + err = 0; rcu_read_lock(); - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - idev = __in6_dev_get(dev); - if (!idev) - goto cont; - if (inet6_fill_ifinfo(skb, idev, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWLINK, NLM_F_MULTI) < 0) - goto out; -cont: - idx++; - } + for_each_netdev_dump(net, dev, ctx->ifindex) { + idev = __in6_dev_get(dev); + if (!idev) + continue; + err = inet6_fill_ifinfo(skb, idev, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWLINK, NLM_F_MULTI); + if (err < 0) + break; } -out: rcu_read_unlock(); - cb->args[1] = idx; - cb->args[0] = h; - return skb->len; + return err; } void inet6_ifinfo_notify(int event, struct inet6_dev *idev) @@ -6058,8 +6190,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); } static inline size_t inet6_prefix_nlmsg_size(void) @@ -6089,11 +6220,7 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, pmsg->prefix_len = pinfo->prefix_len; pmsg->prefix_type = pinfo->type; pmsg->prefix_pad3 = 0; - pmsg->prefix_flags = 0; - if (pinfo->onlink) - pmsg->prefix_flags |= IF_PREFIX_ONLINK; - if (pinfo->autoconf) - pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; + pmsg->prefix_flags = pinfo->flags; if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix)) goto nla_put_failure; @@ -6130,8 +6257,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); } static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -6198,7 +6324,7 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, +static int addrconf_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6223,7 +6349,7 @@ static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, +static int addrconf_sysctl_mtu(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; @@ -6257,44 +6383,45 @@ static void addrconf_disable_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.disable_ipv6) ^ (!newf); - idev->cnf.disable_ipv6 = newf; + + WRITE_ONCE(idev->cnf.disable_ipv6, newf); if (changed) dev_disable_change(idev); } } } -static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) +static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) - return restart_syscall(); - - net = (struct net *)table->extra2; - old = *p; - *p = newf; - if (p == &net->ipv6.devconf_dflt->disable_ipv6) { - rtnl_unlock(); + WRITE_ONCE(*p, newf); return 0; } + if (!rtnl_net_trylock(net)) + return restart_syscall(); + + old = *p; + WRITE_ONCE(*p, newf); + if (p == &net->ipv6.devconf_all->disable_ipv6) { - net->ipv6.devconf_dflt->disable_ipv6 = newf; + WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf); addrconf_disable_change(net, newf); - } else if ((!newf) ^ (!old)) + } else if ((!newf) ^ (!old)) { dev_disable_change((struct inet6_dev *)table->extra1); + } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } -static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, +static int addrconf_sysctl_disable(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6319,7 +6446,7 @@ static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, +static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6333,20 +6460,20 @@ static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, if (write && old != new) { struct net *net = ctl->extra2; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - if (valp == &net->ipv6.devconf_dflt->proxy_ndp) + if (valp == &net->ipv6.devconf_dflt->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - else if (valp == &net->ipv6.devconf_all->proxy_ndp) + } else if (valp == &net->ipv6.devconf_all->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - else { + } else { struct inet6_dev *idev = ctl->extra1; inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -6354,13 +6481,13 @@ static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, idev->dev->ifindex, &idev->cnf); } - rtnl_unlock(); + rtnl_net_unlock(net); } return ret; } -static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, +static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6374,7 +6501,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, .mode = ctl->mode, }; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); new_val = *((u32 *)ctl->data); @@ -6396,33 +6523,38 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, } if (idev->cnf.addr_gen_mode != new_val) { - idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); + netdev_lock_ops(idev->dev); + addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; - net->ipv6.devconf_dflt->addr_gen_mode = new_val; + WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val); for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev && idev->cnf.addr_gen_mode != new_val) { - idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + WRITE_ONCE(idev->cnf.addr_gen_mode, + new_val); + netdev_lock_ops(idev->dev); + addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } } - *((u32 *)ctl->data) = new_val; + WRITE_ONCE(*((u32 *)ctl->data), new_val); } out: - rtnl_unlock(); + rtnl_net_unlock(net); return ret; } -static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, +static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6439,7 +6571,7 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, lctl.maxlen = IPV6_MAX_STRLEN; lctl.data = str; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); if (!write && !secret->initialized) { @@ -6469,27 +6601,28 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, struct net_device *dev; for_each_netdev(net, dev) { - struct inet6_dev *idev = __in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev); if (idev) { - idev->cnf.addr_gen_mode = - IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + WRITE_ONCE(idev->cnf.addr_gen_mode, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY); } } } else { struct inet6_dev *idev = ctl->extra1; - idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + WRITE_ONCE(idev->cnf.addr_gen_mode, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY); } out: - rtnl_unlock(); + rtnl_net_unlock(net); return err; } static -int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, +int addrconf_sysctl_ignore_routes_with_linkdown(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -6557,27 +6690,26 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) } static -int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) +int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) { + struct net *net = (struct net *)ctl->extra2; struct inet6_dev *idev; - struct net *net; - - if (!rtnl_trylock()) - return restart_syscall(); - - *valp = val; - net = (struct net *)ctl->extra2; if (valp == &net->ipv6.devconf_dflt->disable_policy) { - rtnl_unlock(); + WRITE_ONCE(*valp, val); return 0; } + if (!rtnl_net_trylock(net)) + return restart_syscall(); + + WRITE_ONCE(*valp, val); + if (valp == &net->ipv6.devconf_all->disable_policy) { struct net_device *dev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) addrconf_disable_policy_idev(idev, val); } @@ -6586,11 +6718,11 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) addrconf_disable_policy_idev(idev, val); } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } -static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, +static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6612,6 +6744,75 @@ static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, return ret; } +static void addrconf_force_forward_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get_rtnl_net(dev); + if (idev) { + int changed = (!idev->cnf.force_forwarding) ^ (!newf); + + WRITE_ONCE(idev->cnf.force_forwarding, newf); + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + dev->ifindex, &idev->cnf); + } + } +} + +static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct inet6_dev *idev = ctl->extra1; + struct ctl_table tmp_ctl = *ctl; + struct net *net = ctl->extra2; + int *valp = ctl->data; + int new_val = *valp; + int old_val = *valp; + loff_t pos = *ppos; + int ret; + + tmp_ctl.extra1 = SYSCTL_ZERO; + tmp_ctl.extra2 = SYSCTL_ONE; + tmp_ctl.data = &new_val; + + ret = proc_douintvec_minmax(&tmp_ctl, write, buffer, lenp, ppos); + + if (write && old_val != new_val) { + if (!rtnl_net_trylock(net)) + return restart_syscall(); + + WRITE_ONCE(*valp, new_val); + + if (valp == &net->ipv6.devconf_dflt->force_forwarding) { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + } else if (valp == &net->ipv6.devconf_all->force_forwarding) { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + + addrconf_force_forward_change(net, new_val); + } else { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + idev->dev->ifindex, + &idev->cnf); + } + rtnl_net_unlock(net); + } + + if (ret) + *ppos = pos; + return ret; +} + static int minus_one = -1; static const int two_five_five = 255; static u32 ioam6_if_id_max = U16_MAX; @@ -6742,6 +6943,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "regen_min_advance", + .data = &ipv6_devconf.regen_min_advance, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "regen_max_retry", .data = &ipv6_devconf.regen_max_retry, .maxlen = sizeof(int), @@ -6785,12 +6993,37 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "accept_ra_min_lft", + .data = &ipv6_devconf.accept_ra_min_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ra_honor_pio_life", + .data = &ipv6_devconf.ra_honor_pio_life, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "ra_honor_pio_pflag", + .data = &ipv6_devconf.ra_honor_pio_pflag, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #ifdef CONFIG_IPV6_ROUTER_PREF { .procname = "accept_ra_rtr_pref", @@ -7005,7 +7238,9 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.rpl_seg_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ioam6_enabled", @@ -7051,13 +7286,18 @@ static const struct ctl_table addrconf_sysctl[] = { .extra2 = SYSCTL_TWO, }, { - /* sentinel */ - } + .procname = "force_forwarding", + .data = &ipv6_devconf.force_forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_force_forwarding, + }, }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { + size_t table_size = ARRAY_SIZE(addrconf_sysctl); int i, ifindex; struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; @@ -7066,7 +7306,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!table) goto out; - for (i = 0; table[i].data; i++) { + for (i = 0; i < table_size; i++) { table[i].data += (char *)p - (char *)&ipv6_devconf; /* If one of these is already set, then it is not safe to * overwrite either of them: this makes proc_dointvec_minmax @@ -7080,7 +7320,8 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); - p->sysctl_header = register_net_sysctl(net, path, table); + p->sysctl_header = register_net_sysctl_sz(net, path, table, + table_size); if (!p->sysctl_header) goto free; @@ -7103,7 +7344,7 @@ out: static void __addrconf_sysctl_unregister(struct net *net, struct ipv6_devconf *p, int ifindex) { - struct ctl_table *table; + const struct ctl_table *table; if (!p->sysctl_header) return; @@ -7264,6 +7505,27 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { .set_link_af = inet6_set_link_af, }; +static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK, + .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR, + .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR, + .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR, + .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETMULTICAST, + .dumpit = inet6_dump_ifmcaddr, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETANYCAST, + .dumpit = inet6_dump_ifacaddr, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETNETCONF, + .doit = inet6_netconf_get_devconf, .dumpit = inet6_netconf_dump_devconf, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + /* * Init / cleanup code */ @@ -7284,15 +7546,16 @@ int __init addrconf_init(void) if (err < 0) goto out_addrlabel; - addrconf_wq = create_workqueue("ipv6_addrconf"); + /* All works using addrconf_wq need to lock rtnl. */ + addrconf_wq = create_singlethread_workqueue("ipv6_addrconf"); if (!addrconf_wq) { err = -ENOMEM; goto out_nowq; } - rtnl_lock(); + rtnl_net_lock(&init_net); idev = ipv6_add_dev(blackhole_netdev); - rtnl_unlock(); + rtnl_net_unlock(&init_net); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; @@ -7304,40 +7567,14 @@ int __init addrconf_init(void) addrconf_verify(&init_net); - rtnl_af_register(&inet6_ops); + err = rtnl_af_register(&inet6_ops); + if (err) + goto erraf; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK, - NULL, inet6_dump_ifinfo, 0); - if (err < 0) + err = rtnl_register_many(addrconf_rtnl_msg_handlers); + if (err) goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR, - inet6_rtm_newaddr, NULL, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR, - inet6_rtm_deladdr, NULL, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR, - inet6_rtm_getaddr, inet6_dump_ifaddr, - RTNL_FLAG_DOIT_UNLOCKED); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST, - NULL, inet6_dump_ifmcaddr, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST, - NULL, inet6_dump_ifacaddr, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF, - inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, - RTNL_FLAG_DOIT_UNLOCKED); - if (err < 0) - goto errout; err = ipv6_addr_label_rtnl_register(); if (err < 0) goto errout; @@ -7346,6 +7583,7 @@ int __init addrconf_init(void) errout: rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); +erraf: unregister_netdevice_notifier(&ipv6_dev_notf); errlo: destroy_workqueue(addrconf_wq); @@ -7367,17 +7605,17 @@ void addrconf_cleanup(void) rtnl_af_unregister(&inet6_ops); - rtnl_lock(); + rtnl_net_lock(&init_net); /* clean dev list */ for_each_netdev(&init_net, dev) { - if (__in6_dev_get(dev) == NULL) + if (!__in6_dev_get_rtnl_net(dev)) continue; addrconf_ifdown(dev, true); } addrconf_ifdown(init_net.loopback_dev, true); - rtnl_unlock(); + rtnl_net_unlock(&init_net); destroy_workqueue(addrconf_wq); } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 507a8353a6bd..c008d21925d7 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -220,19 +220,26 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { EXPORT_SYMBOL_GPL(ipv6_stub); /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ -const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; +const struct in6_addr in6addr_loopback __aligned(BITS_PER_LONG/8) + = IN6ADDR_LOOPBACK_INIT; EXPORT_SYMBOL(in6addr_loopback); -const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +const struct in6_addr in6addr_any __aligned(BITS_PER_LONG/8) + = IN6ADDR_ANY_INIT; EXPORT_SYMBOL(in6addr_any); -const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +const struct in6_addr in6addr_linklocal_allnodes __aligned(BITS_PER_LONG/8) + = IN6ADDR_LINKLOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_linklocal_allnodes); -const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_linklocal_allrouters __aligned(BITS_PER_LONG/8) + = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_linklocal_allrouters); -const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; +const struct in6_addr in6addr_interfacelocal_allnodes __aligned(BITS_PER_LONG/8) + = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); -const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_interfacelocal_allrouters __aligned(BITS_PER_LONG/8) + = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); -const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_sitelocal_allrouters __aligned(BITS_PER_LONG/8) + = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_sitelocal_allrouters); static void snmp6_free_dev(struct inet6_dev *idev) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 17ac45aa7194..567efd626ab4 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -20,12 +20,6 @@ #include <linux/netlink.h> #include <linux/rtnetlink.h> -#if 0 -#define ADDRLABEL(x...) printk(x) -#else -#define ADDRLABEL(x...) do { ; } while (0) -#endif - /* * Policy Table */ @@ -150,8 +144,8 @@ u32 ipv6_addr_label(struct net *net, label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT; rcu_read_unlock(); - ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", - __func__, addr, type, ifindex, label); + net_dbg_ratelimited("%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", __func__, addr, type, + ifindex, label); return label; } @@ -164,8 +158,8 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, struct ip6addrlbl_entry *newp; int addrtype; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", - __func__, prefix, prefixlen, ifindex, (unsigned int)label); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", __func__, + prefix, prefixlen, ifindex, (unsigned int)label); addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK); @@ -207,8 +201,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, struct hlist_node *n; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, - replace); + net_dbg_ratelimited("%s(newp=%p, replace=%d)\n", __func__, newp, replace); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && @@ -234,7 +227,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - net->ipv6.ip6addrlbl_table.seq++; + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq + 1); return ret; } @@ -246,9 +240,8 @@ static int ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", - __func__, prefix, prefixlen, ifindex, (unsigned int)label, - replace); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) @@ -270,8 +263,8 @@ static int __ip6addrlbl_del(struct net *net, struct hlist_node *n; int ret = -ESRCH; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", - __func__, prefix, prefixlen, ifindex); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, + prefixlen, ifindex); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && @@ -293,8 +286,8 @@ static int ip6addrlbl_del(struct net *net, struct in6_addr prefix_buf; int ret; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", - __func__, prefix, prefixlen, ifindex); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, + prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); spin_lock(&net->ipv6.ip6addrlbl_table.lock); @@ -311,8 +304,6 @@ static int __net_init ip6addrlbl_net_init(struct net *net) int err; int i; - ADDRLABEL(KERN_DEBUG "%s\n", __func__); - spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); @@ -445,7 +436,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, }; static int ip6addrlbl_fill(struct sk_buff *skb, - struct ip6addrlbl_entry *p, + const struct ip6addrlbl_entry *p, u32 lseq, u32 portid, u32 seq, int event, unsigned int flags) @@ -472,12 +463,12 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); return -EINVAL; } - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_prefixlen || ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); @@ -498,7 +489,8 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; - int err; + int err = 0; + u32 lseq; if (cb->strict_check) { err = ip6addrlbl_valid_dump_req(nlh, cb->extack); @@ -507,10 +499,11 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_lock(); + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - net->ipv6.ip6addrlbl_table.seq, + lseq, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -522,7 +515,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline int ip6addrlbl_msgsize(void) @@ -540,7 +533,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, struct ifaddrlblmsg *ifal; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); return -EINVAL; } @@ -549,7 +543,6 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); return -EINVAL; @@ -614,7 +607,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - lseq = net->ipv6.ip6addrlbl_table.seq; + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); if (p) err = ip6addrlbl_fill(skb, p, lseq, NETLINK_CB(in_skb).portid, @@ -631,22 +624,17 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; } +static const struct rtnl_msg_handler ipv6_adddr_label_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDRLABEL, + .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDRLABEL, + .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDRLABEL, + .doit = ip6addrlbl_get, .dumpit = ip6addrlbl_dump, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + int __init ipv6_addr_label_rtnl_register(void) { - int ret; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL, - ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - if (ret < 0) - return ret; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL, - ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - if (ret < 0) - return ret; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, - ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); - return ret; + return rtnl_register_many(ipv6_adddr_label_rtnl_msg_handlers); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fee9163382c2..b705751eb73c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -64,6 +64,7 @@ #include <net/xfrm.h> #include <net/ioam6.h> #include <net/rawv6.h> +#include <net/rps.h> #include <linux/uaccess.h> #include <linux/mroute6.h> @@ -102,9 +103,9 @@ bool ipv6_mod_enabled(void) } EXPORT_SYMBOL_GPL(ipv6_mod_enabled); -static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { - const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + const int offset = sk->sk_prot->ipv6_pinfo_offset; return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } @@ -199,13 +200,16 @@ lookup_protocol: if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; + if (INET_PROTOSW_ICSK & answer_flags) + inet_init_csk_locks(sk); + inet = inet_sk(sk); - inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) - inet->hdrincl = 1; + inet_set_bit(HDRINCL, sk); } sk->sk_destruct = inet6_sock_destruct; @@ -217,18 +221,20 @@ lookup_protocol: inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; - np->mc_loop = 1; - np->mc_all = 1; + inet6_set_bit(MC6_LOOP, sk); + inet6_set_bit(MC6_ALL, sk); np->pmtudisc = IPV6_PMTUDISC_WANT; - np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; + inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect & + FLOWLABEL_REFLECT_ESTABLISHED); sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. */ inet->uc_ttl = -1; - inet->mc_loop = 1; + inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet->mc_index = 0; RCU_INIT_POINTER(inet->mc_list, NULL); @@ -238,16 +244,6 @@ lookup_protocol: inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; - /* - * Increment only the relevant sk_prot->socks debug field, this changes - * the previous behaviour of incrementing both the equivalent to - * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. - * - * This allows better debug granularity as we'll know exactly how many - * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 - * transport protocol socks. -acme - */ - sk_refcnt_debug_inc(sk); if (inet->inet_num) { /* It assumes that any protocol which allows @@ -256,34 +252,32 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); err = sk->sk_prot->hash(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } -static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +static int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; @@ -408,7 +402,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if (snum || !(inet->bind_address_no_port || + if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { @@ -444,10 +438,8 @@ out_unlock: goto out; } -/* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { - struct sock *sk = sock->sk; u32 flags = BIND_WITH_LOCK; const struct proto *prot; int err = 0; @@ -464,13 +456,19 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len, CGROUP_INET6_BIND, &flags); if (err) return err; return __inet6_bind(sk, uaddr, addr_len, flags); } + +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) +{ + return inet6_bind_sk(sock->sk, uaddr, addr_len); +} EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) @@ -509,7 +507,7 @@ void inet6_cleanup_sock(struct sock *sk) /* Free tx options */ - opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + opt = unrcu_pointer(xchg(&np->opt, NULL)); if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); @@ -524,6 +522,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; + int sin_addr_len = sizeof(*sin); struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -541,9 +540,9 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, } sin->sin6_port = inet->inet_dport; sin->sin6_addr = sk->sk_v6_daddr; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = np->flow_label; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, + BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET6_GETPEERNAME); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -551,13 +550,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, + BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET6_GETSOCKNAME); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); release_sock(sk); - return sizeof(*sin); + return sin_addr_len; } EXPORT_SYMBOL(inet6_getname); @@ -588,7 +587,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) prot = READ_ONCE(sk->sk_prot); if (!prot->ioctl) return -ENOIOCTLCMD; - return prot->ioctl(sk, cmd, arg); + return sk_ioctl(sk, cmd, (void __user *)arg); } /*NOTREACHED*/ return 0; @@ -704,10 +703,10 @@ const struct proto_ops inet6_stream_ops = { #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif - .sendpage = inet_sendpage, + .splice_eof = inet_splice_eof, .sendmsg_locked = tcp_sendmsg_locked, - .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, + .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .peek_len = tcp_peek_len, @@ -716,6 +715,7 @@ const struct proto_ops inet6_stream_ops = { #endif .set_rcvlowat = tcp_set_rcvlowat, }; +EXPORT_SYMBOL_GPL(inet6_stream_ops); const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, @@ -737,8 +737,7 @@ const struct proto_ops inet6_dgram_ops = { .recvmsg = inet6_recvmsg, /* retpoline's sake */ .read_skb = udp_read_skb, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, - .set_peek_off = sk_set_peek_off, + .set_peek_off = udp_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif @@ -843,7 +842,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); @@ -854,11 +853,11 @@ int inet6_sk_rebuild_header(struct sock *sk) dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { sk->sk_route_caps = 0; - sk->sk_err_soft = -PTR_ERR(dst); + WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); } return 0; @@ -883,7 +882,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), @@ -961,6 +959,8 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; + net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0; + net->ipv6.sysctl.icmpv6_errors_extension_mask = 0; /* By default, rate limit error messages. * Except for pmtu discovery, it would break it. @@ -1054,11 +1054,13 @@ static const struct ipv6_stub ipv6_stub_impl = { #if IS_ENABLED(CONFIG_XFRM) .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu, .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, + .xfrm6_gro_udp_encap_rcv = xfrm6_gro_udp_encap_rcv, .xfrm6_rcv_encap = xfrm6_rcv_encap, #endif .nd_tbl = &nd_tbl, .ipv6_fragment = ip6_fragment, .ipv6_dev_find = ipv6_dev_find, + .ip6_xmit = ip6_xmit, }; static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { @@ -1066,6 +1068,7 @@ static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .udp6_lib_lookup = __udp6_lib_lookup, .ipv6_setsockopt = do_ipv6_setsockopt, .ipv6_getsockopt = do_ipv6_getsockopt, + .ipv6_dev_get_saddr = ipv6_dev_get_saddr, }; static int __init inet6_init(void) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 5228d2716289..95372e0f1d21 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -13,8 +13,8 @@ #define pr_fmt(fmt) "IPv6: " fmt -#include <crypto/algapi.h> #include <crypto/hash.h> +#include <crypto/utils.h> #include <linux/module.h> #include <linux/slab.h> #include <net/ip.h> @@ -46,14 +46,40 @@ struct ah_skb_cb { #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) +/* Helper to save IPv6 addresses and extension headers to temporary storage */ +static inline void ah6_save_hdrs(struct tmp_ext *iph_ext, + struct ipv6hdr *top_iph, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + iph_ext->saddr = top_iph->saddr; +#endif + iph_ext->daddr = top_iph->daddr; + memcpy(&iph_ext->hdrs, top_iph + 1, extlen - sizeof(*iph_ext)); +} + +/* Helper to restore IPv6 addresses and extension headers from temporary storage */ +static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph, + struct tmp_ext *iph_ext, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + top_iph->saddr = iph_ext->saddr; +#endif + top_iph->daddr = iph_ext->daddr; + memcpy(top_iph + 1, &iph_ext->hdrs, extlen - sizeof(*iph_ext)); +} + static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { unsigned int len; - len = size + crypto_ahash_digestsize(ahash) + - (crypto_ahash_alignmask(ahash) & - ~(crypto_tfm_ctx_alignment() - 1)); + len = size + crypto_ahash_digestsize(ahash); len = ALIGN(len, crypto_tfm_ctx_alignment()); @@ -75,10 +101,9 @@ static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset) return tmp + offset; } -static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, - unsigned int offset) +static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset) { - return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); + return tmp + offset; } static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, @@ -281,12 +306,12 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) return 0; } -static void ah6_output_done(struct crypto_async_request *base, int err) +static void ah6_output_done(void *data, int err) { int extlen; u8 *iph_base; u8 *icv; - struct sk_buff *skb = base->data; + struct sk_buff *skb = data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct ipv6hdr *top_iph = ipv6_hdr(skb); @@ -299,18 +324,12 @@ static void ah6_output_done(struct crypto_async_request *base, int err) iph_base = AH_SKB_CB(skb)->tmp; iph_ext = ah_tmp_ext(iph_base); - icv = ah_tmp_icv(ahp->ahash, iph_ext, extlen); + icv = ah_tmp_icv(iph_ext, extlen); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); @@ -362,7 +381,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) iph_ext = ah_tmp_ext(iph_base); seqhi = (__be32 *)((char *)iph_ext + extlen); - icv = ah_tmp_icv(ahash, seqhi, seqhi_len); + icv = ah_tmp_icv(seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; @@ -381,12 +400,8 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) */ memcpy(iph_base, top_iph, IPV6HDR_BASELEN); + ah6_save_hdrs(iph_ext, top_iph, extlen); if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(iph_ext, &top_iph->saddr, extlen); -#else - memcpy(iph_ext, &top_iph->daddr, extlen); -#endif err = ipv6_clear_mutable_options(top_iph, extlen - sizeof(*iph_ext) + sizeof(*top_iph), @@ -437,13 +452,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); out_free: kfree(iph_base); @@ -451,12 +460,12 @@ out: return err; } -static void ah6_input_done(struct crypto_async_request *base, int err) +static void ah6_input_done(void *data, int err) { u8 *auth_data; u8 *icv; u8 *work_iph; - struct sk_buff *skb = base->data; + struct sk_buff *skb = data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); @@ -468,7 +477,7 @@ static void ah6_input_done(struct crypto_async_request *base, int err) work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, hdr_len); - icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); + icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) @@ -576,7 +585,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); - icv = ah_tmp_icv(ahash, seqhi, seqhi_len); + icv = ah_tmp_icv(seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; @@ -803,5 +812,6 @@ static void __exit ah6_fini(void) module_init(ah6_init); module_exit(ah6_fini); +MODULE_DESCRIPTION("IPv6 AH transformation helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index dacdea7fcb62..52599584422b 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -47,11 +47,15 @@ static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; static DEFINE_SPINLOCK(acaddr_hash_lock); +#define ac_dereference(a, idev) \ + rcu_dereference_protected(a, lockdep_is_held(&(idev)->lock)) + static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); -static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) +static u32 inet6_acaddr_hash(const struct net *net, + const struct in6_addr *addr) { - u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } @@ -63,14 +67,12 @@ static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_ac_socklist *pac = NULL; + struct net *net = sock_net(sk); + netdevice_tracker dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev; - struct ipv6_ac_socklist *pac; - struct net *net = sock_net(sk); - int ishost = !net->ipv6.devconf_all->forwarding; - int err = 0; - - ASSERT_RTNL(); + int err = 0, ishost; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -78,32 +80,43 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return -EINVAL; if (ifindex) - dev = __dev_get_by_index(net, ifindex); + dev = netdev_get_by_index(net, ifindex, &dev_tracker, GFP_KERNEL); - if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) - return -EINVAL; + if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) { + err = -EINVAL; + goto error; + } pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); - if (!pac) - return -ENOMEM; + if (!pac) { + err = -ENOMEM; + goto error; + } + pac->acl_next = NULL; pac->acl_addr = *addr; + ishost = !READ_ONCE(net->ipv6.devconf_all->forwarding); + if (ifindex == 0) { struct rt6_info *rt; + rcu_read_lock(); rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { - dev = rt->dst.dev; + dev = dst_dev_rcu(&rt->dst); + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); ip6_rt_put(rt); } else if (ishost) { + rcu_read_unlock(); err = -EADDRNOTAVAIL; goto error; } else { /* router, no matching interface: just pick one */ - dev = __dev_get_by_flags(net, IFF_UP, - IFF_UP | IFF_LOOPBACK); + dev = netdev_get_by_flags_rcu(net, &dev_tracker, IFF_UP, + IFF_UP | IFF_LOOPBACK); } + rcu_read_unlock(); } if (!dev) { @@ -111,7 +124,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } - idev = __in6_dev_get(dev); + idev = in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; @@ -119,8 +132,9 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = -EADDRNOTAVAIL; goto error; } + /* reset ishost, now that we have a specific device */ - ishost = !idev->cnf.forwarding; + ishost = !READ_ONCE(idev->cnf.forwarding); pac->acl_ifindex = dev->ifindex; @@ -133,7 +147,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ishost) err = -EADDRNOTAVAIL; if (err) - goto error; + goto error_idev; } err = __ipv6_dev_ac_inc(idev, addr); @@ -143,7 +157,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac = NULL; } +error_idev: + in6_dev_put(idev); error: + netdev_put(dev, &dev_tracker); + if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; @@ -154,12 +172,10 @@ error: */ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct net_device *dev; struct ipv6_ac_socklist *pac, *prev_pac; + struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); - - ASSERT_RTNL(); + struct net_device *dev; prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { @@ -175,9 +191,11 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) else np->ipv6_ac_list = pac->acl_next; - dev = __dev_get_by_index(net, pac->acl_ifindex); - if (dev) + dev = dev_get_by_index(net, pac->acl_ifindex); + if (dev) { ipv6_dev_ac_dec(dev, &pac->acl_addr); + dev_put(dev); + } sock_kfree_s(sk, pac, sizeof(*pac)); return 0; @@ -186,21 +204,20 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) void __ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); struct net_device *dev = NULL; struct ipv6_ac_socklist *pac; - struct net *net = sock_net(sk); - int prev_index; + int prev_index = 0; - ASSERT_RTNL(); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; - prev_index = 0; while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - dev = __dev_get_by_index(net, pac->acl_ifindex); + dev_put(dev); + dev = dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -208,6 +225,8 @@ void __ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } + + dev_put(dev); } void ipv6_sock_ac_close(struct sock *sk) @@ -216,9 +235,8 @@ void ipv6_sock_ac_close(struct sock *sk) if (!np->ipv6_ac_list) return; - rtnl_lock(); + __ipv6_sock_ac_close(sk); - rtnl_unlock(); } static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) @@ -252,9 +270,8 @@ static void aca_free_rcu(struct rcu_head *h) static void aca_put(struct ifacaddr6 *ac) { - if (refcount_dec_and_test(&ac->aca_refcnt)) { - call_rcu(&ac->rcu, aca_free_rcu); - } + if (refcount_dec_and_test(&ac->aca_refcnt)) + call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, @@ -278,6 +295,37 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, return aca; } +static void inet6_ifacaddr_notify(struct net_device *dev, + const struct ifacaddr6 *ifaca, int event) +{ + struct inet6_fill_args fillargs = { + .event = event, + .netnsid = -1, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); + if (err < 0) { + pr_err("Failed to fill in anycast addresses (err %d)\n", err); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err); +} + /* * device anycast group inc (add if not found) */ @@ -288,15 +336,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) struct net *net; int err; - ASSERT_RTNL(); - write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; goto out; } - for (aca = idev->ac_list; aca; aca = aca->aca_next) { + for (aca = ac_dereference(idev->ac_list, idev); aca; + aca = ac_dereference(aca->aca_next, idev)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) { aca->aca_users++; err = 0; @@ -305,7 +352,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC); + f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); goto out; @@ -317,13 +364,13 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) goto out; } - aca->aca_next = idev->ac_list; - idev->ac_list = aca; - /* Hold this for addrconf_join_solict() below before we unlock, * it is already exposed via idev->ac_list. */ aca_get(aca); + aca->aca_next = idev->ac_list; + rcu_assign_pointer(idev->ac_list, aca); + write_unlock_bh(&idev->lock); ipv6_add_acaddr_hash(net, aca); @@ -332,6 +379,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) addrconf_join_solict(idev->dev, &aca->aca_addr); + inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST); + aca_put(aca); return 0; out: @@ -346,11 +395,10 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; - ASSERT_RTNL(); - write_lock_bh(&idev->lock); prev_aca = NULL; - for (aca = idev->ac_list; aca; aca = aca->aca_next) { + for (aca = ac_dereference(idev->ac_list, idev); aca; + aca = ac_dereference(aca->aca_next, idev)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) break; prev_aca = aca; @@ -364,27 +412,33 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } if (prev_aca) - prev_aca->aca_next = aca->aca_next; + rcu_assign_pointer(prev_aca->aca_next, aca->aca_next); else - idev->ac_list = aca->aca_next; + rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); + inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST); + aca_put(aca); return 0; } -/* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { - struct inet6_dev *idev = __in6_dev_get(dev); + struct inet6_dev *idev = in6_dev_get(dev); + int err; if (!idev) return -ENODEV; - return __ipv6_dev_ac_dec(idev, addr); + + err = __ipv6_dev_ac_dec(idev, addr); + in6_dev_put(idev); + + return err; } void ipv6_ac_destroy_dev(struct inet6_dev *idev) @@ -392,8 +446,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) struct ifacaddr6 *aca; write_lock_bh(&idev->lock); - while ((aca = idev->ac_list) != NULL) { - idev->ac_list = aca->aca_next; + while ((aca = ac_dereference(idev->ac_list, idev)) != NULL) { + rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); @@ -420,11 +474,10 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - for (aca = idev->ac_list; aca; aca = aca->aca_next) + for (aca = rcu_dereference(idev->ac_list); aca; + aca = rcu_dereference(aca->aca_next)) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; - read_unlock_bh(&idev->lock); return aca != NULL; } return false; @@ -477,30 +530,25 @@ bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, struct ac6_iter_state { struct seq_net_private p; struct net_device *dev; - struct inet6_dev *idev; }; #define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) { - struct ifacaddr6 *im = NULL; struct ac6_iter_state *state = ac6_seq_private(seq); struct net *net = seq_file_net(seq); + struct ifacaddr6 *im = NULL; - state->idev = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; + idev = __in6_dev_get(state->dev); if (!idev) continue; - read_lock_bh(&idev->lock); - im = idev->ac_list; - if (im) { - state->idev = idev; + im = rcu_dereference(idev->ac_list); + if (im) break; - } - read_unlock_bh(&idev->lock); } return im; } @@ -508,22 +556,17 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) { struct ac6_iter_state *state = ac6_seq_private(seq); + struct inet6_dev *idev; - im = im->aca_next; + im = rcu_dereference(im->aca_next); while (!im) { - if (likely(state->idev != NULL)) - read_unlock_bh(&state->idev->lock); - state->dev = next_net_device_rcu(state->dev); - if (!state->dev) { - state->idev = NULL; + if (!state->dev) break; - } - state->idev = __in6_dev_get(state->dev); - if (!state->idev) + idev = __in6_dev_get(state->dev); + if (!idev) continue; - read_lock_bh(&state->idev->lock); - im = state->idev->ac_list; + im = rcu_dereference(idev->ac_list); } return im; } @@ -555,12 +598,6 @@ static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ac6_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { - struct ac6_iter_state *state = ac6_seq_private(seq); - - if (likely(state->idev != NULL)) { - read_unlock_bh(&state->idev->lock); - state->idev = NULL; - } rcu_read_unlock(); } diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 1578ed9e97d8..df1986973430 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -29,10 +29,10 @@ #include <net/calipso.h> #include <linux/atomic.h> #include <linux/bug.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/crc-ccitt.h> -/* Maximium size of the calipso option including +/* Maximum size of the calipso option including * the two-byte TLV header. */ #define CALIPSO_OPT_LEN_MAX (2 + 252) @@ -42,13 +42,13 @@ */ #define CALIPSO_HDR_LEN (2 + 8) -/* Maximium size of the calipso option including +/* Maximum size of the calipso option including * the two-byte TLV header and upto 3 bytes of * leading pad and 7 bytes of trailing pad. */ #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) - /* Maximium size of u32 aligned buffer required to hold calipso + /* Maximum size of u32 aligned buffer required to hold calipso * option. Max of 3 initial pad bytes starting from buffer + 3. * i.e. the worst case is when the previous tlv finishes on 4n + 3. */ @@ -657,11 +657,8 @@ static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, net_clen_bits, spot + 1, 1); - if (spot < 0) { - if (spot == -2) - return -EFAULT; + if (spot < 0) return 0; - } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, spot, @@ -1075,8 +1072,13 @@ static int calipso_sock_getattr(struct sock *sk, struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; @@ -1128,8 +1130,13 @@ static int calipso_sock_setattr(struct sock *sk, { int ret_val; struct ipv6_opt_hdr *old, *new; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); old = NULL; if (txopts) old = txopts->hopopt; @@ -1156,8 +1163,13 @@ static int calipso_sock_setattr(struct sock *sk, static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + if (!pinfo) + return; + + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; @@ -1195,6 +1207,10 @@ static int calipso_req_setattr(struct request_sock *req, struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); + /* sk is NULL for SYN+ACK w/ SYN Cookie */ + if (!sk) + return -ENOMEM; + if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else @@ -1235,6 +1251,10 @@ static void calipso_req_delattr(struct request_sock *req) struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); + /* sk is NULL for SYN+ACK w/ SYN Cookie */ + if (!sk) + return; + if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e624497fa992..83e03176819c 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -38,10 +38,11 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) +static void ip6_datagram_flow_key_init(struct flowi6 *fl6, + const struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); @@ -51,17 +52,17 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; - fl6->flowlabel = np->flow_label; - fl6->flowi6_uid = sk->sk_uid; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); + fl6->flowi6_uid = sk_uid(sk); if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif) { if (ipv6_addr_is_multicast(&fl6->daddr)) - oif = np->mcast_oif; + oif = READ_ONCE(np->mcast_oif); else - oif = np->ucast_oif; + oif = READ_ONCE(np->ucast_oif); } fl6->flowi6_oif = oif; @@ -79,7 +80,8 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) struct flowi6 fl6; int err = 0; - if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) { + if (inet6_test_bit(SNDFLOW, sk) && + (np->flow_label & IPV6_FLOWLABEL_MASK)) { flowlabel = fl6_sock_lookup(sk, np->flow_label); if (IS_ERR(flowlabel)) return -EINVAL; @@ -125,7 +127,7 @@ void ip6_datagram_release_cb(struct sock *sk) rcu_read_lock(); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; @@ -136,7 +138,7 @@ void ip6_datagram_release_cb(struct sock *sk) } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); -int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, +int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; @@ -162,7 +164,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (ipv6_addr_any(&usin->sin6_addr)) { @@ -192,7 +194,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, - (struct sockaddr *) &sin, + (struct sockaddr_unsized *)&sin, sizeof(sin)); ipv4_connected: @@ -227,7 +229,7 @@ ipv4_connected: } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) - WRITE_ONCE(sk->sk_bound_dev_if, np->mcast_oif); + WRITE_ONCE(sk->sk_bound_dev_if, READ_ONCE(np->mcast_oif)); /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { @@ -269,7 +271,7 @@ out: } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); -int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; @@ -280,7 +282,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_SYMBOL_GPL(ip6_datagram_connect); -int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); @@ -304,11 +306,10 @@ static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { - struct ipv6_pinfo *np = inet6_sk(sk); struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; - if (!np->recverr) + if (!inet6_test_bit(RECVERR6, sk)) return; skb = skb_clone(skb, GFP_ATOMIC); @@ -331,7 +332,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __skb_pull(skb, payload - skb->data); - if (inet6_sk(sk)->recverr_rfc4884) + if (inet6_test_bit(RECVERR6_RFC4884, sk)) ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); @@ -343,12 +344,11 @@ EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { - const struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; - if (!np->recverr) + if (!inet6_test_bit(RECVERR6, sk)) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); @@ -492,7 +492,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = ip6_flowinfo(ip6h); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, @@ -523,7 +523,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); - if (inet_sk(sk)->cmsg_flags) + if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } } @@ -1064,9 +1064,9 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 14ed868680c6..e75da98f5283 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include <net/tcp.h> #include <net/espintcp.h> #include <net/inet6_hashtables.h> +#include <linux/skbuff_ref.h> #include <linux/highmem.h> @@ -112,7 +113,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -131,55 +132,25 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - put_page(sg_page(sg)); + skb_page_unref(page_to_netmem(sg_page(sg)), + skb->pp_recycle); } #ifdef CONFIG_INET6_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, - dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport, + &x->props.saddr.in6, ntohs(sport), 0, 0); if (!sk) return ERR_PTR(-ENOENT); @@ -188,20 +159,6 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -214,8 +171,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) sk = esp6_find_tcp_sk(x); err = PTR_ERR_OR_ZERO(sk); - if (err) + if (err) { + kfree_skb(skb); goto out; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -224,6 +183,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -255,8 +216,7 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) #else static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) { - kfree_skb(skb); - + WARN_ON(1); return -EOPNOTSUPP; } #endif @@ -278,9 +238,9 @@ static void esp_output_encap_csum(struct sk_buff *skb) } } -static void esp_output_done(struct crypto_async_request *base, int err) +static void esp_output_done(void *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = data; struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; struct xfrm_state *x; @@ -294,7 +254,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); kfree(tmp); esp_output_encap_csum(skb); @@ -314,7 +274,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } @@ -368,12 +328,12 @@ static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, return esph; } -static void esp_output_done_esn(struct crypto_async_request *base, int err) +static void esp_output_done_esn(void *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = data; esp_output_restore_header(skb); - esp_output_done(base, err); + esp_output_done(data, err); } static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, @@ -383,7 +343,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, __be16 dport) { struct udphdr *uh; - __be32 *udpdata32; unsigned int len; len = skb->len + esp->tailen - skb_transport_offset(skb); @@ -398,12 +357,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, *skb_mac_header(skb) = IPPROTO_UDP; - if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - return (struct ip_esp_hdr *)(udpdata32 + 2); - } - return (struct ip_esp_hdr *)(uh + 1); } @@ -428,6 +381,8 @@ static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); @@ -459,7 +414,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); break; case TCP_ENCAP_ESPINTCP: @@ -677,7 +631,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } if (sg != dsg) - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); @@ -770,7 +724,9 @@ static inline int esp_remove_trailer(struct sk_buff *skb) skb->csum = csum_block_sub(skb->csum, csumdiff, skb->len - trimlen); } - pskb_trim(skb, skb->len - trimlen); + ret = pskb_trim(skb, skb->len - trimlen); + if (unlikely(ret)) + return ret; ret = nexthdr[1]; @@ -820,7 +776,6 @@ int esp6_input_done2(struct sk_buff *skb, int err) source = th->source; break; case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; break; default: @@ -831,7 +786,7 @@ int esp6_input_done2(struct sk_buff *skb, int err) /* * 1) if the NAT-T peer's IP or port changed then - * advertize the change to the keying daemon. + * advertise the change to the keying daemon. * This is an inbound SA, so just compare * SRC ports. */ @@ -865,7 +820,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == XFRM_MODE_IPTFS) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); @@ -879,9 +835,9 @@ out: } EXPORT_SYMBOL_GPL(esp6_input_done2); -static void esp_input_done(struct crypto_async_request *base, int err) +static void esp_input_done(void *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = data; xfrm_input_resume(skb, esp6_input_done2(skb, err)); } @@ -909,12 +865,12 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) } } -static void esp_input_done_esn(struct crypto_async_request *base, int err) +static void esp_input_done_esn(void *data, int err) { - struct sk_buff *skb = base->data; + struct sk_buff *skb = data; esp_input_restore_header(skb); - esp_input_done(base, err); + esp_input_done(data, err); } static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) @@ -1230,9 +1186,6 @@ static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); - break; #ifdef CONFIG_INET6_ESPINTCP case TCP_ENCAP_ESPINTCP: /* only the length field, TCP encap is done by @@ -1299,5 +1252,6 @@ static void __exit esp6_fini(void) module_init(esp6_init); module_exit(esp6_fini); +MODULE_DESCRIPTION("IPv6 ESP transformation helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 75c02992c520..22410243ebe8 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -17,6 +17,7 @@ #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> +#include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> @@ -33,7 +34,9 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) int off = sizeof(struct ipv6hdr); struct ipv6_opt_hdr *exthdr; - if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP)) + /* ESP or ESPINUDP */ + if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP || + ipv6_hdr->nexthdr == NEXTHDR_UDP)) return offsetof(struct ipv6hdr, nexthdr); while (off < nhlen) { @@ -53,10 +56,14 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; + int encap_type = 0; __be32 seq; __be32 spi; int nhoff; + if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) + encap_type = UDP_ENCAP_ESPINUDP; + if (!pskb_pull(skb, offset)) return NULL; @@ -73,9 +80,16 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ipv6_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET6); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET6); + + if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { + /* non-offload path will record the error and audit log */ + xfrm_state_put(x); + x = NULL; + } + if (!x) goto out_reset; @@ -103,7 +117,7 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error. */ - xfrm_input(skb, IPPROTO_ESP, spi, -2); + xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: @@ -144,8 +158,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) - : htons(ETH_P_IPV6); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP) + : htons(ETH_P_IPV6); return skb_eth_gso_segment(skb, features, type); } @@ -374,6 +390,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index a8d961d3a477..a23eb8734e15 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -50,6 +50,7 @@ #endif #include <net/rpl.h> #include <linux/ioam6.h> +#include <linux/ioam6_genl.h> #include <net/ioam6.h> #include <net/dst_metadata.h> @@ -126,9 +127,6 @@ static bool ip6_parse_tlv(bool hopbyhop, max_count = -max_count; } - if (skb_transport_offset(skb) + len > skb_headlen(skb)) - goto bad; - off += 2; len -= 2; @@ -180,6 +178,8 @@ static bool ip6_parse_tlv(bool hopbyhop, case IPV6_TLV_IOAM: if (!ipv6_hop_ioam(skb, off)) return false; + + nh = skb_network_header(skb); break; case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) @@ -306,7 +306,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(dev_net(dst->dev), idev, + __IP6_INC_STATS(dev_net(dst_dev(dst)), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); @@ -379,9 +379,8 @@ static int ipv6_srh_rcv(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); - accept_seg6 = net->ipv6.devconf_all->seg6_enabled; - if (accept_seg6 > idev->cnf.seg6_enabled) - accept_seg6 = idev->cnf.seg6_enabled; + accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled), + READ_ONCE(idev->cnf.seg6_enabled)); if (!accept_seg6) { kfree_skb(skb); @@ -402,11 +401,7 @@ looped_back: skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - - if (!pskb_pull(skb, offset)) { - kfree_skb(skb); - return -1; - } + skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); @@ -444,9 +439,9 @@ looped_back: kfree_skb(skb); return -1; } - } - hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); + hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); + } hdr->segments_left--; addr = hdr->segments + hdr->segments_left; @@ -458,8 +453,6 @@ looped_back: ipv6_hdr(skb)->daddr = *addr; - skb_dst_drop(skb); - ip6_route_input(skb); if (skb_dst(skb)->error) { @@ -467,7 +460,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, @@ -501,10 +494,8 @@ static int ipv6_rpl_srh_rcv(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); - accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; - if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) - accept_rpl_seg = idev->cnf.rpl_seg_enabled; - + accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled), + READ_ONCE(idev->cnf.rpl_seg_enabled)); if (!accept_rpl_seg) { kfree_skb(skb); return -1; @@ -519,11 +510,7 @@ looped_back: skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - - if (!pskb_pull(skb, offset)) { - kfree_skb(skb); - return -1; - } + skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); @@ -545,11 +532,6 @@ looped_back: return 1; } - if (!pskb_may_pull(skb, sizeof(*hdr))) { - kfree_skb(skb); - return -1; - } - n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); r = do_div(n, (16 - hdr->cmpri)); /* checks if calculation was without remainder and n fits into @@ -569,30 +551,6 @@ looped_back: return -1; } - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE, 0, - GFP_ATOMIC)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); - return -1; - } - } else { - err = skb_cow_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE); - if (unlikely(err)) { - kfree_skb(skb); - return -1; - } - } - - hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); - - if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri, - hdr->cmpre))) { - kfree_skb(skb); - return -1; - } - hdr->segments_left--; i = n - hdr->segments_left; @@ -606,8 +564,7 @@ looped_back: ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); - if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) || - (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) { + if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) { kfree_skb(skb); kfree(buf); return -1; @@ -630,6 +587,17 @@ looped_back: skb_pull(skb, ((hdr->hdrlen + 1) << 3)); skb_postpull_rcsum(skb, oldhdr, sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); + if (unlikely(!hdr->segments_left)) { + if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0, + GFP_ATOMIC)) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + kfree(buf); + return -1; + } + + oldhdr = ipv6_hdr(skb); + } skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); @@ -644,8 +612,6 @@ looped_back: kfree(buf); - skb_dst_drop(skb); - ip6_route_input(skb); if (skb_dst(skb)->error) { @@ -653,7 +619,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, @@ -682,15 +648,17 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; - struct in6_addr daddr; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; struct net *net = dev_net(skb->dev); - int accept_source_route = net->ipv6.devconf_all->accept_source_route; + int accept_source_route; + + accept_source_route = READ_ONCE(net->ipv6.devconf_all->accept_source_route); - if (idev && accept_source_route > idev->cnf.accept_source_route) - accept_source_route = idev->cnf.accept_source_route; + if (idev) + accept_source_route = min(accept_source_route, + READ_ONCE(idev->cnf.accept_source_route)); if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + @@ -813,7 +781,7 @@ looped_back: kfree_skb(skb); return -1; } - if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { + if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -830,19 +798,16 @@ looped_back: return -1; } - daddr = *addr; - *addr = ipv6_hdr(skb)->daddr; - ipv6_hdr(skb)->daddr = daddr; + swap(*addr, ipv6_hdr(skb)->daddr); - skb_dst_drop(skb); ip6_route_input(skb); if (skb_dst(skb)->error) { - skb_push(skb, skb->data - skb_network_header(skb)); + skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; } - if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, @@ -854,7 +819,7 @@ looped_back: goto looped_back; } - skb_push(skb, skb->data - skb_network_header(skb)); + skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; @@ -916,14 +881,6 @@ void ipv6_exthdrs_exit(void) Hop-by-hop options. **********************************/ -/* - * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). - */ -static inline struct net *ipv6_skb_net(struct sk_buff *skb) -{ - return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); -} - /* Router Alert as of RFC 2711 */ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) @@ -954,7 +911,7 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) goto drop; /* Ignore if IOAM is not enabled on ingress */ - if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) + if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled)) goto ignore; /* Truncated Option header */ @@ -974,14 +931,25 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) goto drop; /* Ignore if the IOAM namespace is unknown */ - ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); + ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id); if (!ns) goto ignore; if (!skb_valid_dst(skb)) ip6_route_input(skb); + /* About to mangle packet header */ + if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len)) + goto drop; + + /* Trace pointer may have changed */ + trace = (struct ioam6_trace_hdr *)(skb_network_header(skb) + + optoff + sizeof(*hdr)); + ioam6_fill_trace_data(skb, ns, trace, true); + + ioam6_event(IOAM6_EVENT_TRACE, dev_net(skb->dev), + GFP_ATOMIC, (void *)trace, hdr->opt_len - 2); break; default: break; @@ -1234,10 +1202,9 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; - opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; - memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index da46c4284676..49e31e4ae7b7 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -143,6 +143,8 @@ int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) optlen = 1; break; default: + if (len < 2) + goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c index 06750d65d480..4c00398f4dca 100644 --- a/net/ipv6/exthdrs_offload.c +++ b/net/ipv6/exthdrs_offload.c @@ -16,6 +16,10 @@ static const struct net_offload dstopt_offload = { .flags = INET6_PROTO_GSO_EXTHDR, }; +static const struct net_offload hbh_offload = { + .flags = INET6_PROTO_GSO_EXTHDR, +}; + int __init ipv6_exthdrs_offload_init(void) { int ret; @@ -28,9 +32,16 @@ int __init ipv6_exthdrs_offload_init(void) if (ret) goto out_rt; + ret = inet6_add_offload(&hbh_offload, IPPROTO_HOPOPTS); + if (ret) + goto out_dstopts; + out: return ret; +out_dstopts: + inet6_del_offload(&dstopt_offload, IPPROTO_DSTOPTS); + out_rt: inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING); goto out; diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index f87ae33e1d01..949b72610df7 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -22,7 +22,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, return call_fib_notifiers(net, event_type, info); } -static unsigned int fib6_seq_read(struct net *net) +static unsigned int fib6_seq_read(const struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 7c2003833010..fd5f7112a51f 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -26,14 +26,18 @@ struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; + __be32 flowlabel; + __be32 flowlabel_mask; dscp_t dscp; + dscp_t dscp_mask; + u8 dscp_full:1; /* DSCP or TOS selector */ }; static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); - if (r->dst.plen || r->src.plen || r->dscp) + if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask) return false; return fib_rule_matchall(rule); } @@ -55,7 +59,7 @@ int fib6_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, AF_INET6, extack); } -unsigned int fib6_rules_seq_read(struct net *net) +unsigned int fib6_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET6); } @@ -233,8 +237,12 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, rt = pol_lookup_func(lookup, net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { + struct inet6_dev *idev = ip6_dst_idev(&rt->dst); + + if (!idev) + goto again; err = fib6_rule_saddr(net, rule, flags, flp6, - ip6_dst_idev(&rt->dst)->dev); + idev->dev); if (err == -EAGAIN) goto again; @@ -324,31 +332,101 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, return 0; } - if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel)) + if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask) + return 0; + + if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) return 0; if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl6->fl6_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl6->fl6_dport)) return 0; return 1; } +static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + if (rule6->dscp) { + NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); + return -EINVAL; + } + + rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); + rule6->dscp_full = true; + + return 0; +} + +static int fib6_nl2rule_dscp_mask(const struct nlattr *nla, + struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + dscp_t dscp_mask; + + if (!rule6->dscp_full) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Cannot specify DSCP mask without DSCP value"); + return -EINVAL; + } + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + if (rule6->dscp & ~dscp_mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); + return -EINVAL; + } + + rule6->dscp_mask = dscp_mask; + + return 0; +} + +static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + __be32 flowlabel, flowlabel_mask; + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) || + NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK)) + return -EINVAL; + + flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]); + flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]); + + if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK], + "Invalid flow label mask"); + return -EINVAL; + } + + if (flowlabel & ~flowlabel_mask) { + NL_SET_ERR_MSG(extack, "Flow label and mask do not match"); + return -EINVAL; + } + + rule6->flowlabel = flowlabel; + rule6->flowlabel_mask = flowlabel_mask; + + return 0; +} + static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { + struct fib6_rule *rule6 = (struct fib6_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct net *net = sock_net(skb->sk); - struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, @@ -356,6 +434,18 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, goto errout; } rule6->dscp = inet_dsfield_to_dscp(frh->tos); + rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0; + + if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) + goto errout; + + if (tb[FRA_DSCP_MASK] && + fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0) + goto errout; + + if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && + fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) + goto errout; if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { @@ -409,7 +499,33 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) return 0; - if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos) + if (frh->tos && + (rule6->dscp_full || + inet_dscp_to_dsfield(rule6->dscp) != frh->tos)) + return 0; + + if (tb[FRA_DSCP]) { + dscp_t dscp; + + dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); + if (!rule6->dscp_full || rule6->dscp != dscp) + return 0; + } + + if (tb[FRA_DSCP_MASK]) { + dscp_t dscp_mask; + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); + if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask) + return 0; + } + + if (tb[FRA_FLOWLABEL] && + nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) + return 0; + + if (tb[FRA_FLOWLABEL_MASK] && + nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask) return 0; if (frh->src_len && @@ -430,7 +546,22 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; - frh->tos = inet_dscp_to_dsfield(rule6->dscp); + + if (rule6->dscp_full) { + frh->tos = 0; + if (nla_put_u8(skb, FRA_DSCP, + inet_dscp_to_dsfield(rule6->dscp) >> 2) || + nla_put_u8(skb, FRA_DSCP_MASK, + inet_dscp_to_dsfield(rule6->dscp_mask) >> 2)) + goto nla_put_failure; + } else { + frh->tos = inet_dscp_to_dsfield(rule6->dscp); + } + + if (rule6->flowlabel_mask && + (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) || + nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask))) + goto nla_put_failure; if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || @@ -446,7 +577,16 @@ nla_put_failure: static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ - + nla_total_size(16); /* src */ + + nla_total_size(16) /* src */ + + nla_total_size(1) /* dscp */ + + nla_total_size(1) /* dscp mask */ + + nla_total_size(4) /* flowlabel */ + + nla_total_size(4); /* flowlabel mask */ +} + +static void fib6_rule_flush_cache(struct fib_rules_ops *ops) +{ + rt_genid_bump_ipv6(ops->fro_net); } static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { @@ -461,6 +601,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, + .flush_cache = fib6_rule_flush_cache, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, .fro_net = &init_net, @@ -475,11 +616,11 @@ static int __net_init fib6_rules_net_init(struct net *net) if (IS_ERR(ops)) return PTR_ERR(ops); - err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0); + err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL); if (err) goto out_fib6_rules_ops; - err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0); + err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN); if (err) goto out_fib6_rules_ops; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9d92d51c4757..5d2f90babaa5 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); @@ -175,14 +175,17 @@ static bool icmpv6_mask_allow(struct net *net, int type) return false; } -static bool icmpv6_global_allow(struct net *net, int type) +static bool icmpv6_global_allow(struct net *net, int type, + bool *apply_ratelimit) { if (icmpv6_mask_allow(net, type)) return true; - if (icmp_global_allow()) + if (icmp_global_allow(net)) { + *apply_ratelimit = true; return true; - + } + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -190,13 +193,14 @@ static bool icmpv6_global_allow(struct net *net, int type) * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) + struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); + struct net_device *dev; struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(net, type)) + if (!apply_ratelimit) return true; /* @@ -205,13 +209,15 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); + rcu_read_lock(); + dev = dst_dev_rcu(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); - } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { + } else if (dev && (dev->flags & IFF_LOOPBACK)) { res = true; } else { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); int tmo = net->ipv6.sysctl.icmpv6_time; struct inet_peer *peer; @@ -219,11 +225,14 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); } + rcu_read_unlock(); + if (!res) + __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST); + else + icmp_global_consume(net); dst_release(dst); return res; } @@ -237,7 +246,7 @@ static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, dst = ip6_route_output(net, sk, fl6); if (!dst->error) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct in6_addr prefsrc; rt6_get_prefsrc(rt, &prefsrc); @@ -328,7 +337,6 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt { struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hao *hao; - struct in6_addr tmp; int off; if (opt->dsthao) { @@ -336,9 +344,7 @@ static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); - tmp = iph->saddr; - iph->saddr = hao->addr; - hao->addr = tmp; + swap(iph->saddr, hao->addr); } } } @@ -361,9 +367,10 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, /* * We won't send icmp if the destination is known - * anycast. + * anycast unless we need to treat anycast as unicast. */ - if (ipv6_anycast_destination(dst, &fl6->daddr)) { + if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) && + ipv6_anycast_destination(dst, &fl6->daddr)) { net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); @@ -383,7 +390,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, return dst; } - err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6); + err = xfrm_decode_session_reverse(net, skb, flowi6_to_flowi(&fl2), AF_INET6); if (err) goto relookup_failed; @@ -422,7 +429,10 @@ static struct net_device *icmp6_dev(const struct sk_buff *skb) if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); - if (rt6) + /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.), + * and ip6_null_entry could be set to skb if no route is found. + */ + if (rt6 && rt6->rt6i_idev) dev = rt6->rt6i_idev->dev; } @@ -434,6 +444,193 @@ static int icmp6_iif(const struct sk_buff *skb) return icmp6_dev(skb)->ifindex; } +struct icmp6_ext_iio_addr6_subobj { + __be16 afi; + __be16 reserved; + struct in6_addr addr6; +}; + +static unsigned int icmp6_ext_iio_len(void) +{ + return sizeof(struct icmp_extobj_hdr) + + /* ifIndex */ + sizeof(__be32) + + /* Interface Address Sub-Object */ + sizeof(struct icmp6_ext_iio_addr6_subobj) + + /* Interface Name Sub-Object. Length must be a multiple of 4 + * bytes. + */ + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + + /* MTU */ + sizeof(__be32); +} + +static unsigned int icmp6_ext_max_len(u8 ext_objs) +{ + unsigned int ext_max_len; + + ext_max_len = sizeof(struct icmp_ext_hdr); + + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + ext_max_len += icmp6_ext_iio_len(); + + return ext_max_len; +} + +static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev) +{ + struct inet6_dev *in6_dev; + struct inet6_ifaddr *ifa; + + in6_dev = __in6_dev_get(dev); + if (!in6_dev) + return NULL; + + /* It is unclear from RFC 5837 which IP address should be chosen, but + * it makes sense to choose a global unicast address. + */ + list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED)) + continue; + if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST || + ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL) + continue; + return &ifa->addr; + } + + return NULL; +} + +static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb, + int iif) +{ + struct icmp_ext_iio_name_subobj *name_subobj; + struct icmp_extobj_hdr *objh; + struct net_device *dev; + struct in6_addr *addr6; + __be32 data; + + if (!iif) + return; + + /* Add the fields in the order specified by RFC 5837. */ + objh = skb_put(skb, sizeof(*objh)); + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); + + data = htonl(iif); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); + if (!dev) + goto out; + + addr6 = icmp6_ext_iio_addr6_find(dev); + if (addr6) { + struct icmp6_ext_iio_addr6_subobj *addr6_subobj; + + addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj)); + addr6_subobj->afi = htons(ICMP_AFI_IP6); + addr6_subobj->addr6 = *addr6; + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; + } + + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); + netdev_copy_name(dev, name_subobj->name); + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; + + data = htonl(READ_ONCE(dev->mtu)); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; + +out: + rcu_read_unlock(); + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); +} + +static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb, + u8 ext_objs, int iif) +{ + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + icmp6_ext_iio_iif_append(net, skb, iif); +} + +static struct sk_buff * +icmp6_ext_append(struct net *net, struct sk_buff *skb_in, + struct icmp6hdr *icmp6h, unsigned int room, int iif) +{ + unsigned int payload_len, ext_max_len, ext_len; + struct icmp_ext_hdr *ext_hdr; + struct sk_buff *skb; + u8 ext_objs; + int nhoff; + + switch (icmp6h->icmp6_type) { + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEED: + break; + default: + return NULL; + } + + /* Do not overwrite existing extensions. This can happen when we + * receive an ICMPv4 message with extensions from a tunnel and + * translate it to an ICMPv6 message towards an IPv6 host in the + * overlay network. + */ + if (icmp6h->icmp6_datagram_len) + return NULL; + + ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask); + if (!ext_objs) + return NULL; + + ext_max_len = icmp6_ext_max_len(ext_objs); + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) + return NULL; + + skb = skb_clone(skb_in, GFP_ATOMIC); + if (!skb) + return NULL; + + nhoff = skb_network_offset(skb); + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); + + if (!pskb_network_may_pull(skb, payload_len)) + goto free_skb; + + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) + goto free_skb; + + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) + goto free_skb; + + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); + ext_hdr->version = ICMP_EXT_VERSION_2; + + icmp6_ext_objs_append(net, skb, ext_objs, iif); + + /* Do not send an empty extension structure. */ + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; + if (ext_len == sizeof(*ext_hdr)) + goto free_skb; + + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); + /* The length of the original datagram in 64-bit words (RFC 4884). */ + icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64); + + return skb; + +free_skb: + consume_skb(skb); + return NULL; +} + /* * Send an ICMP message in response to a packet in error */ @@ -447,7 +644,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; + bool apply_ratelimit = false; + struct sk_buff *ext_skb; struct dst_entry *dst; + unsigned int room; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; @@ -463,7 +663,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!skb->dev) return; - net = dev_net(skb->dev); + + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules @@ -486,7 +689,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) - return; + goto out; saddr = NULL; } @@ -516,7 +719,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* @@ -525,14 +728,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } - /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, type, &apply_ratelimit)) goto out_bh_enable; mip6_addr_swap(skb, parm); @@ -570,8 +774,8 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); - if (!icmpv6_xrlim_allow(sk, type, &fl6)) - goto out; + if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) + goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; @@ -579,17 +783,17 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, tmp_hdr.icmp6_pointer = htonl(info); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); - ipcm6_init_sk(&ipc6, np); + ipcm6_init_sk(&ipc6, sk); ipc6.sockc.mark = mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) - goto out; + goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); @@ -597,21 +801,25 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, msg.offset = skb_network_offset(skb); msg.type = type; - len = skb->len - msg.offset; - len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); + room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr); + ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif); + if (ext_skb) + msg.skb = ext_skb; + + len = msg.skb->len - msg.offset; + len = min_t(unsigned int, len, room); if (len < 0) { net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out_dst_release; } - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, + &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); @@ -619,13 +827,17 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } - rcu_read_unlock(); + out_dst_release: + if (ext_skb) + consume_skb(ext_skb); dst_release(dst); -out: +out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); +out: + rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); @@ -668,8 +880,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, - skb, 0); + rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -704,32 +916,34 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, } EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); -static void icmpv6_echo_reply(struct sk_buff *skb) +static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); + bool apply_ratelimit = false; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); + SKB_DR(reason); bool acast; u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) - return; + return reason; saddr = &ipv6_hdr(skb)->daddr; acast = ipv6_anycast_destination(skb_dst(skb), saddr); if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) - return; + return reason; if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) @@ -764,9 +978,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; @@ -775,8 +989,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) goto out; /* Check the ratelimit */ - if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || - !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + if ((!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit)) goto out_dst_release; idev = __in6_dev_get(skb->dev); @@ -785,7 +1000,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.offset = 0; msg.type = type; - ipcm6_init_sk(&ipc6, np); + ipcm6_init_sk(&ipc6, sk); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); ipc6.sockc.mark = mark; @@ -797,12 +1012,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT)) { + dst_rt6_info(dst), MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); + reason = SKB_CONSUMED; } out_dst_release: dst_release(dst); @@ -810,18 +1026,22 @@ out: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); + return reason; } -void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) +enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, + u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; + enum skb_drop_reason reason; int inner_offset; __be16 frag_off; u8 nexthdr; - struct net *net = dev_net(skb->dev); - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr)); + if (reason != SKB_NOT_DROPPED_YET) goto out; seg6_icmp_srh(skb, opt); @@ -831,14 +1051,17 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (inner_offset < 0) + if (inner_offset < 0) { + SKB_DR_SET(reason, IPV6_BAD_EXTHDR); goto out; + } } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ - if (!pskb_may_pull(skb, inner_offset+8)) + reason = pskb_may_pull_reason(skb, inner_offset + 8); + if (reason != SKB_NOT_DROPPED_YET) goto out; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. @@ -853,10 +1076,11 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) ipprot->err_handler(skb, opt, type, code, inner_offset, info); raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); - return; + return SKB_CONSUMED; out: __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return reason; } /* @@ -866,7 +1090,7 @@ out: static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -898,7 +1122,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -916,26 +1140,23 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) - icmpv6_echo_reply(skb); + reason = icmpv6_echo_reply(skb); break; case ICMPV6_EXT_ECHO_REQUEST: if (!net->ipv6.sysctl.icmpv6_echo_ignore_all && READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe)) - icmpv6_echo_reply(skb); + reason = icmpv6_echo_reply(skb); break; case ICMPV6_ECHO_REPLY: - reason = ping_rcv(skb); - break; - case ICMPV6_EXT_ECHO_REPLY: - reason = ping_rcv(skb); - break; + ping_rcv(skb); + return 0; case ICMPV6_PKT_TOOBIG: /* BUGGG_FUTURE: if packet contains rthdr, we cannot update @@ -952,7 +1173,8 @@ static int icmpv6_rcv(struct sk_buff *skb) case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: - icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + reason = icmpv6_notify(skb, type, hdr->icmp6_code, + hdr->icmp6_mtu); break; case NDISC_ROUTER_SOLICITATION: @@ -960,7 +1182,7 @@ static int icmpv6_rcv(struct sk_buff *skb) case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: - ndisc_rcv(skb); + reason = ndisc_rcv(skb); break; case ICMPV6_MGM_QUERY: @@ -994,7 +1216,8 @@ static int icmpv6_rcv(struct sk_buff *skb) * must pass to upper level */ - icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + reason = icmpv6_notify(skb, type, hdr->icmp6_code, + hdr->icmp6_mtu); } /* until the v6 path can be better sorted assume failure and @@ -1009,19 +1232,17 @@ static int icmpv6_rcv(struct sk_buff *skb) csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; } -void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, - u8 type, +void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, - const struct in6_addr *daddr, - int oif) + const struct in6_addr *daddr, int oif) { memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; @@ -1146,6 +1367,10 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL + +static u32 icmpv6_errors_extension_mask_all = + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); + static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", @@ -1182,7 +1407,24 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_do_large_bitmap, }, - { }, + { + .procname = "error_anycast_as_unicast", + .data = &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "errors_extension_mask", + .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &icmpv6_errors_extension_mask_all, + }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) @@ -1199,7 +1441,14 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; + table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast; + table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask; } return table; } + +size_t ipv6_icmp_sysctl_table_size(void) +{ + return ARRAY_SIZE(ipv6_icmp_table_template); +} #endif diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index ad5f6f6ba333..85b92917849b 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -108,6 +108,7 @@ int ila_lwt_init(void); void ila_lwt_fini(void); int ila_xlat_init_net(struct net *net); +void ila_xlat_pre_exit_net(struct net *net); void ila_xlat_exit_net(struct net *net); int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 95e9146918cc..b8d43ed4689d 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -86,7 +86,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&th->check, skb, - diff, true); + diff, true, true); } break; case NEXTHDR_UDP: @@ -97,7 +97,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&uh->check, skb, - diff, true); + diff, true, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -111,7 +111,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, - diff, true); + diff, true, true); } break; } diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 8c1ce78956ba..7bb9edc5c28c 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -38,7 +38,7 @@ static inline struct ila_params *ila_params_lwtunnel( static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); - struct rt6_info *rt = (struct rt6_info *)orig_dst; + struct rt6_info *rt = dst_rt6_info(orig_dst); struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate); struct dst_entry *dst; int err = -EINVAL; @@ -58,7 +58,9 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) return orig_dst->lwtstate->orig_output(net, sk, skb); } + local_bh_disable(); dst = dst_cache_get(&ilwt->dst_cache); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6; @@ -68,9 +70,9 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = orig_dst->dev->ifindex; + fl6.flowi6_oif = dst_dev(orig_dst)->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst, + fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); dst = ip6_route_output(net, NULL, &fl6); @@ -86,10 +88,15 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (ilwt->connected) + /* cache only if we don't create a dst reference loop */ + if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); + local_bh_enable(); + } } + skb_dst_drop(skb); skb_dst_set(skb, dst); return dst_output(net, sk, skb); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 3faf62530d6a..976c78efbae1 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <net/genetlink.h> -#include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" @@ -72,6 +71,11 @@ ila_xlat_init_fail: return err; } +static __net_exit void ila_pre_exit_net(struct net *net) +{ + ila_xlat_pre_exit_net(net); +} + static __net_exit void ila_exit_net(struct net *net) { ila_xlat_exit_net(net); @@ -79,6 +83,7 @@ static __net_exit void ila_exit_net(struct net *net) static struct pernet_operations ila_net_ops = { .init = ila_init_net, + .pre_exit = ila_pre_exit_net, .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 47447f0241df..1d41b2ab4884 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -5,7 +5,6 @@ #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> -#include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" @@ -106,16 +105,11 @@ static int parse_nl_config(struct genl_info *info, xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); - if (info->attrs[ILA_ATTR_CSUM_MODE]) - xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); - else - xp->ip.csum_mode = ILA_CSUM_NO_ACTION; + xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE], + ILA_CSUM_NO_ACTION); - if (info->attrs[ILA_ATTR_IDENT_TYPE]) - xp->ip.ident_type = nla_get_u8( - info->attrs[ILA_ATTR_IDENT_TYPE]); - else - xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; + xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE], + ILA_ATYPE_USE_FORMAT); if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -201,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = { }, }; +static DEFINE_MUTEX(ila_mutex); + static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -208,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->xlat.hooks_registered) { + if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ - err = nf_register_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); + mutex_lock(&ila_mutex); + if (!ilan->xlat.hooks_registered) { + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (!err) + WRITE_ONCE(ilan->xlat.hooks_registered, true); + } + mutex_unlock(&ila_mutex); if (err) return err; - - ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); @@ -477,6 +477,7 @@ int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); + ret = -ESRCH; ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, @@ -619,6 +620,15 @@ int ila_xlat_init_net(struct net *net) return 0; } +void ila_xlat_pre_exit_net(struct net *net) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + + if (ilan->xlat.hooks_registered) + nf_unregister_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); +} + void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -626,10 +636,6 @@ void ila_xlat_exit_net(struct net *net) rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); - - if (ilan->xlat.hooks_registered) - nf_unregister_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 5a9f4d722f35..ea5cf3fdfdd6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -45,7 +45,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); @@ -54,21 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); - -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = sk->sk_v6_daddr; - sin6->sin6_port = inet_sk(sk)->inet_dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - sk->sk_bound_dev_if); -} -EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -94,7 +79,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); @@ -106,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); } return dst; } @@ -120,7 +105,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused dst = inet6_csk_route_socket(sk, &fl6); if (IS_ERR(dst)) { - sk->sk_err_soft = -PTR_ERR(dst); + WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); sk->sk_route_caps = 0; kfree_skb(skb); return PTR_ERR(dst); @@ -133,7 +118,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused fl6.daddr = sk->sk_v6_daddr; res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass, sk->sk_priority); + np->tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; } @@ -151,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b64b49012655..5e1da088d8e1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -14,31 +14,31 @@ #include <linux/random.h> #include <net/addrconf.h> +#include <net/hotdata.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> +#include <net/tcp.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { - static u32 inet6_ehash_secret __read_mostly; - static u32 ipv6_hash_secret __read_mostly; - u32 lhash, fhash; net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); - net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); + net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; - fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); - return __inet6_ehashfn(lhash, lport, fhash, fport, - inet6_ehash_secret + net_hash_mix(net)); + return lport + __inet6_ehashfn(lhash, 0, fhash, fport, + inet6_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so @@ -46,25 +46,24 @@ u32 inet6_ehashfn(const struct net *net, * * The sockhash lock must be held as a reader here. */ -struct sock *__inet6_lookup_established(struct net *net, - struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif, const int sdif) +struct sock *__inet6_lookup_established(const struct net *net, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif, const int sdif) { - struct sock *sk; - const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - + const struct hlist_nulls_node *node; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -89,7 +88,7 @@ found: } EXPORT_SYMBOL(__inet6_lookup_established); -static inline int compute_score(struct sock *sk, struct net *net, +static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) @@ -111,25 +110,43 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned short hnum) +/** + * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. + * @net: network namespace. + * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. + * @skb: context for a potential SK_REUSEPORT program. + * @doff: header offset. + * @saddr: source address. + * @sport: source port. + * @daddr: destination address. + * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. + * + * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to + * the selected sock or an error. + */ +struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum, + inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } +EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ -static struct sock *inet6_lhash2_lookup(struct net *net, +static struct sock *inet6_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -143,8 +160,8 @@ static struct sock *inet6_lhash2_lookup(struct net *net, sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - result = lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + result = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; @@ -156,50 +173,54 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } -static inline struct sock *inet6_lookup_run_bpf(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, const int dif) +struct sock *inet6_lookup_run_sk_lookup(const struct net *net, + int protocol, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, const int dif, + inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != net->ipv4.tcp_death_row.hashinfo) - return NULL; /* only TCP is supported */ - - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport, + no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } - -struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif, const int sdif) +EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); + +struct sock *inet6_lookup_listener(const struct net *net, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - result = inet6_lookup_run_bpf(net, hashinfo, skb, doff, - saddr, sport, daddr, hnum, dif); + result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, + saddr, sport, daddr, hnum, dif, + inet6_ehashfn); if (result) goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -223,7 +244,7 @@ done: } EXPORT_SYMBOL_GPL(inet6_lookup_listener); -struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, +struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, @@ -232,7 +253,7 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sock *sk; bool refcounted; - sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -242,7 +263,9 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup, + u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -252,14 +275,26 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, - inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet6_match(net, sk2, saddr, daddr, + ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; + } + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { @@ -270,7 +305,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (twsk_unique(sk, sk2, twp)) + if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; @@ -317,22 +352,19 @@ static u64 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { + const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *saddr = &sk->sk_v6_daddr; + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); u64 port_offset = 0; + u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); - return __inet_hash_connect(death_row, sk, port_offset, - __inet6_check_established); -} -EXPORT_SYMBOL_GPL(inet6_hash_connect); -int inet6_hash(struct sock *sk) -{ - int err = 0; + hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); - - return err; + return __inet_hash_connect(death_row, sk, port_offset, hash_port0, + __inet6_check_established); } -EXPORT_SYMBOL_GPL(inet6_hash); +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 571f0e4d9cf3..9553a3200081 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -135,15 +135,11 @@ static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) ns->id = id; - if (!info->attrs[IOAM6_ATTR_NS_DATA]) - data32 = IOAM6_U32_UNAVAILABLE; - else - data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); + data32 = nla_get_u32_default(info->attrs[IOAM6_ATTR_NS_DATA], + IOAM6_U32_UNAVAILABLE); - if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) - data64 = IOAM6_U64_UNAVAILABLE; - else - data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); + data64 = nla_get_u64_default(info->attrs[IOAM6_ATTR_NS_DATA_WIDE], + IOAM6_U64_UNAVAILABLE); ns->data = cpu_to_be32(data32); ns->data_wide = cpu_to_be64(data64); @@ -612,6 +608,68 @@ static const struct genl_ops ioam6_genl_ops[] = { }, }; +#define IOAM6_GENL_EV_GRP_OFFSET 0 + +static const struct genl_multicast_group ioam6_mcgrps[] = { + [IOAM6_GENL_EV_GRP_OFFSET] = { .name = IOAM6_GENL_EV_GRP_NAME, + .flags = GENL_MCAST_CAP_NET_ADMIN }, +}; + +static int ioam6_event_put_trace(struct sk_buff *skb, + struct ioam6_trace_hdr *trace, + unsigned int len) +{ + if (nla_put_u16(skb, IOAM6_EVENT_ATTR_TRACE_NAMESPACE, + be16_to_cpu(trace->namespace_id)) || + nla_put_u8(skb, IOAM6_EVENT_ATTR_TRACE_NODELEN, trace->nodelen) || + nla_put_u32(skb, IOAM6_EVENT_ATTR_TRACE_TYPE, + be32_to_cpu(trace->type_be32)) || + nla_put(skb, IOAM6_EVENT_ATTR_TRACE_DATA, + len - sizeof(struct ioam6_trace_hdr) - trace->remlen * 4, + trace->data + trace->remlen * 4)) + return 1; + + return 0; +} + +void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp, + void *opt, unsigned int opt_len) +{ + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&ioam6_genl_family, net, + IOAM6_GENL_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &ioam6_genl_family, 0, type); + if (!nlh) + goto nla_put_failure; + + switch (type) { + case IOAM6_EVENT_UNSPEC: + WARN_ON_ONCE(1); + break; + case IOAM6_EVENT_TRACE: + if (ioam6_event_put_trace(skb, (struct ioam6_trace_hdr *)opt, + opt_len)) + goto nla_put_failure; + break; + } + + genlmsg_end(skb, nlh); + genlmsg_multicast_netns(&ioam6_genl_family, net, skb, 0, + IOAM6_GENL_EV_GRP_OFFSET, gfp); + return; + +nla_put_failure: + nlmsg_free(skb); +} + static struct genl_family ioam6_genl_family __ro_after_init = { .name = IOAM6_GENL_NAME, .version = IOAM6_GENL_VERSION, @@ -620,6 +678,8 @@ static struct genl_family ioam6_genl_family __ro_after_init = { .ops = ioam6_genl_ops, .n_ops = ARRAY_SIZE(ioam6_genl_ops), .resv_start_op = IOAM6_CMD_NS_SET_SCHEMA + 1, + .mcgrps = ioam6_mcgrps, + .n_mcgrps = ARRAY_SIZE(ioam6_mcgrps), .module = THIS_MODULE, }; @@ -636,6 +696,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc, u8 sclen, bool is_input) { + struct net_device *dev = skb_dst_dev(skb); struct timespec64 ts; ktime_t tstamp; u64 raw64; @@ -652,7 +713,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (is_input) byte--; - raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; + raw32 = dev_net(dev)->ipv6.sysctl.ioam6_id; *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); data += sizeof(__be32); @@ -663,15 +724,15 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id; + raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + if (dev->flags & IFF_LOOPBACK) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id; + raw16 = (__force u16)READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); @@ -723,10 +784,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct Qdisc *qdisc; __u32 qlen, backlog; - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (dev->flags & IFF_LOOPBACK) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { - queue = skb_get_tx_queue(skb_dst(skb)->dev, skb); + queue = skb_get_tx_queue(dev, skb); qdisc = rcu_dereference(queue->qdisc); qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog); @@ -747,7 +808,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (is_input) byte--; - raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; + raw64 = dev_net(dev)->ipv6.sysctl.ioam6_id_wide; *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); data += sizeof(__be64); @@ -758,15 +819,15 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide; + raw32 = READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + if (dev->flags & IFF_LOOPBACK) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide; + raw32 = READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index f6f5b83dd954..1fe7894f14dd 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -38,15 +38,18 @@ struct ioam6_lwt_freq { }; struct ioam6_lwt { + struct dst_entry null_dst; struct dst_cache cache; struct ioam6_lwt_freq freq; atomic_t pkt_cnt; u8 mode; + bool has_tunsrc; + struct in6_addr tunsrc; struct in6_addr tundst; - struct ioam6_lwt_encap tuninfo; + struct ioam6_lwt_encap tuninfo; }; -static struct netlink_range_validation freq_range = { +static const struct netlink_range_validation freq_range = { .min = IOAM6_IPTUNNEL_FREQ_MIN, .max = IOAM6_IPTUNNEL_FREQ_MAX, }; @@ -72,8 +75,10 @@ static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8, IOAM6_IPTUNNEL_MODE_MIN, IOAM6_IPTUNNEL_MODE_MAX), + [IOAM6_IPTUNNEL_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), - [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), + [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN( + sizeof(struct ioam6_trace_hdr)), }; static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) @@ -85,7 +90,7 @@ static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | - trace->type.bit21) + trace->type.bit21 | trace->type.bit23) return false; trace->nodelen = 0; @@ -138,10 +143,13 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, } } - if (!tb[IOAM6_IPTUNNEL_MODE]) - mode = IOAM6_IPTUNNEL_MODE_INLINE; - else - mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]); + mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE], + IOAM6_IPTUNNEL_MODE_INLINE); + + if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) { + NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode"); + return -EINVAL; + } if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) { NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination"); @@ -167,19 +175,48 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, ilwt = ioam6_lwt_state(lwt); err = dst_cache_init(&ilwt->cache, GFP_ATOMIC); - if (err) { - kfree(lwt); - return err; - } + if (err) + goto free_lwt; + + /* This "fake" dst_entry will be stored in a dst_cache, which will call + * dst_hold() and dst_release() on it. We must ensure that dst_destroy() + * will never be called. For that, its initial refcount is 1 and +1 when + * it is stored in the cache. Then, +1/-1 each time we read the cache + * and release it. Long story short, we're fine. + */ + dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); atomic_set(&ilwt->pkt_cnt, 0); ilwt->freq.k = freq_k; ilwt->freq.n = freq_n; ilwt->mode = mode; - if (tb[IOAM6_IPTUNNEL_DST]) + + if (!tb[IOAM6_IPTUNNEL_SRC]) { + ilwt->has_tunsrc = false; + } else { + ilwt->has_tunsrc = true; + ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]); + + if (ipv6_addr_any(&ilwt->tunsrc)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC], + "invalid tunnel source address"); + err = -EINVAL; + goto free_cache; + } + } + + if (tb[IOAM6_IPTUNNEL_DST]) { ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]); + if (ipv6_addr_any(&ilwt->tundst)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST], + "invalid tunnel dest address"); + err = -EINVAL; + goto free_cache; + } + } + tuninfo = ioam6_lwt_info(lwt); tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1; tuninfo->pad[0] = IPV6_TLV_PADN; @@ -201,6 +238,11 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, *ts = lwt; return 0; +free_cache: + dst_cache_destroy(&ilwt->cache); +free_lwt: + kfree(lwt); + return err; } static int ioam6_do_fill(struct net *net, struct sk_buff *skb) @@ -220,14 +262,15 @@ static int ioam6_do_fill(struct net *net, struct sk_buff *skb) } static int ioam6_do_inline(struct net *net, struct sk_buff *skb, - struct ioam6_lwt_encap *tuninfo) + struct ioam6_lwt_encap *tuninfo, + struct dst_entry *cache_dst) { struct ipv6hdr *oldhdr, *hdr; int hdrlen, err; hdrlen = (tuninfo->eh.hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -256,7 +299,10 @@ static int ioam6_do_inline(struct net *net, struct sk_buff *skb, static int ioam6_do_encap(struct net *net, struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo, - struct in6_addr *tundst) + bool has_tunsrc, + struct in6_addr *tunsrc, + struct in6_addr *tundst, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr, *inner_hdr; @@ -265,7 +311,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdrlen = (tuninfo->eh.hdrlen + 1) << 3; len = sizeof(*hdr) + hdrlen; - err = skb_cow_head(skb, len + skb->mac_len); + err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -285,8 +331,12 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdr->nexthdr = NEXTHDR_HOP; hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); hdr->daddr = *tundst; - ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, - IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); + + if (has_tunsrc) + memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc)); + else + ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr, + IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); skb_postpush_rcsum(skb, hdr, len); @@ -295,8 +345,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct in6_addr orig_daddr; + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -304,14 +354,27 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ilwt = ioam6_lwt_state(dst->lwtstate); + ilwt = ioam6_lwt_state(orig_dst->lwtstate); /* Check for insertion frequency (i.e., "k over n" insertions) */ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt); if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k) goto out; - orig_daddr = ipv6_hdr(skb)->daddr; + local_bh_disable(); + dst = dst_cache_get(&ilwt->cache); + local_bh_enable(); + + /* This is how we notify that the destination does not change after + * transformation and that we need to use orig_dst instead of the cache + */ + if (dst == &ilwt->null_dst) { + dst_release(dst); + + dst = orig_dst; + /* keep refcount balance: dst_release() is called at the end */ + dst_hold(dst); + } switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: @@ -320,7 +383,7 @@ do_inline: if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst); if (unlikely(err)) goto drop; @@ -328,7 +391,9 @@ do_inline: case IOAM6_IPTUNNEL_MODE_ENCAP: do_encap: /* Encapsulation (ip6ip6) */ - err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst); + err = ioam6_do_encap(net, skb, &ilwt->tuninfo, + ilwt->has_tunsrc, &ilwt->tunsrc, + &ilwt->tundst, dst); if (unlikely(err)) goto drop; @@ -346,52 +411,65 @@ do_encap: goto drop; } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; - if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { - preempt_disable(); - dst = dst_cache_get(&ilwt->cache); - preempt_enable(); - - if (unlikely(!dst)) { - struct ipv6hdr *hdr = ipv6_hdr(skb); - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = hdr->daddr; - fl6.saddr = hdr->saddr; - fl6.flowlabel = ip6_flowinfo(hdr); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = hdr->nexthdr; - - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - err = dst->error; - dst_release(dst); - goto drop; - } - - preempt_disable(); - dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - preempt_enable(); + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; + goto drop; } + /* If the destination is the same after transformation (which is + * a valid use case for IOAM), then we don't want to add it to + * the cache in order to avoid a reference loop. Instead, we add + * our fake dst_entry to the cache as a way to detect this case. + * Otherwise, we add the resolved destination to the cache. + */ + local_bh_disable(); + if (orig_dst->lwtstate == dst->lwtstate) + dst_cache_set_ip6(&ilwt->cache, + &ilwt->null_dst, &fl6.saddr); + else + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); + local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); + if (unlikely(err)) + goto drop; + } + + /* avoid lwtunnel_output() reentry loop when destination is the same + * after transformation (e.g., with the inline mode) + */ + if (orig_dst->lwtstate != dst->lwtstate) { skb_dst_drop(skb); skb_dst_set(skb, dst); - return dst_output(net, sk, skb); } out: - return dst->lwtstate->orig_output(net, sk, skb); + dst_release(dst); + return orig_dst->lwtstate->orig_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } static void ioam6_destroy_state(struct lwtunnel_state *lwt) { + /* Since the refcount of per-cpu dst_entry caches will never be 0 (see + * why above) when our "fake" dst_entry is used, it is not necessary to + * remove them before calling dst_cache_destroy() + */ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); } @@ -414,6 +492,13 @@ static int ioam6_fill_encap_info(struct sk_buff *skb, goto ret; if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + if (ilwt->has_tunsrc) { + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC, + &ilwt->tunsrc); + if (err) + goto ret; + } + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst); if (err) goto ret; @@ -435,8 +520,12 @@ static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) nla_total_size(sizeof(ilwt->mode)) + nla_total_size(sizeof(ilwt->tuninfo.traceh)); - if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) + if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + if (ilwt->has_tunsrc) + nlsize += nla_total_size(sizeof(ilwt->tunsrc)); + nlsize += nla_total_size(sizeof(ilwt->tundst)); + } return nlsize; } @@ -451,17 +540,21 @@ static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) return (ilwt_a->freq.k != ilwt_b->freq.k || ilwt_a->freq.n != ilwt_b->freq.n || ilwt_a->mode != ilwt_b->mode || + ilwt_a->has_tunsrc != ilwt_b->has_tunsrc || (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) || + (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && + ilwt_a->has_tunsrc && + !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) || trace_a->namespace_id != trace_b->namespace_id); } static const struct lwtunnel_encap_ops ioam6_iptun_ops = { .build_state = ioam6_build_state, .destroy_state = ioam6_destroy_state, - .output = ioam6_output, + .output = ioam6_output, .fill_encap = ioam6_fill_encap_info, - .get_encap_size = ioam6_encap_nlsize, + .get_encap_size = ioam6_encap_nlsize, .cmp_encap = ioam6_encap_cmp, .owner = THIS_MODULE, }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2438da5ff6da..2111af022d94 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); + INIT_HLIST_NODE(&f6i->gc_link); + return f6i; } @@ -196,16 +198,9 @@ static void node_free_immediate(struct net *net, struct fib6_node *fn) net->ipv6.rt6_stats->fib_nodes--; } -static void node_free_rcu(struct rcu_head *head) -{ - struct fib6_node *fn = container_of(head, struct fib6_node, rcu); - - kmem_cache_free(fib6_node_kmem, fn); -} - static void node_free(struct net *net, struct fib6_node *fn) { - call_rcu(&fn->rcu, node_free_rcu); + kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } @@ -246,6 +241,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; @@ -253,40 +249,52 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *fib6_new_table(struct net *net, u32 id) { - struct fib6_table *tb; + struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(net, id); - if (tb) - fib6_link_table(net, tb); + new_tb = fib6_alloc_table(net, id); + if (!new_tb) + return NULL; + + spin_lock_bh(&net->ipv6.fib_table_hash_lock); + + tb = fib6_get_table(net, id); + if (unlikely(tb)) { + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + kfree(new_tb); + return tb; + } + + fib6_link_table(net, new_tb); + + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); - return tb; + return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { - struct fib6_table *tb; struct hlist_head *head; - unsigned int h; + struct fib6_table *tb; - if (id == 0) + if (!id) id = RT6_TABLE_MAIN; - h = id & (FIB6_TABLE_HASHSZ - 1); - rcu_read_lock(); - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - if (tb->tb6_id == id) { - rcu_read_unlock(); + + head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; + + /* See comment in fib6_link_table(). RCU is not required, + * but rcu_dereference_raw() is used to avoid data-race. + */ + hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) + if (tb->tb6_id == id) return tb; - } - } - rcu_read_unlock(); return NULL; } @@ -342,17 +350,17 @@ static void __net_init fib6_tables_init(struct net *net) #endif -unsigned int fib6_tables_seq_read(struct net *net) +unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { - struct hlist_head *head = &net->ipv6.fib_table_hash[h]; - struct fib6_table *tb; + const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib_seq += tb->fib_seq; + fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); @@ -397,7 +405,7 @@ int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -413,7 +421,7 @@ int call_fib6_multipath_entry_notifiers(struct net *net, .nsiblings = nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -424,7 +432,7 @@ int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) .nsiblings = rt->fib6_nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } @@ -437,15 +445,17 @@ struct fib6_dump_arg { static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; + unsigned int nsiblings; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; - if (rt->fib6_nsiblings) + nsiblings = READ_ONCE(rt->fib6_nsiblings); + if (nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, - rt->fib6_nsiblings, + nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, @@ -617,23 +627,25 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { - struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, - .filter.dump_routes = true }; + struct rt6_rtnl_dump_arg arg = { + .filter.dump_exceptions = true, + .filter.dump_routes = true, + .filter.rtnl_held = false, + }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - unsigned int h, s_h; unsigned int e = 0, s_e; + struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; - struct hlist_head *head; - int res = 0; + unsigned int h, s_h; + int err = 0; + rcu_read_lock(); if (cb->strict_check) { - int err; - err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) - return err; + goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); @@ -645,19 +657,21 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (!w) { /* New dump: * - * 1. hook callback destructor. - */ - cb->args[3] = (long)cb->done; - cb->done = fib6_dump_done; - - /* - * 2. allocate and initialize walker. + * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return -ENOMEM; + if (!w) { + err = -ENOMEM; + goto unlock; + } w->func = fib6_dump_node; cb->args[2] = (long)w; + + /* 2. hook callback destructor. + */ + cb->args[3] = (long)cb->done; + cb->done = fib6_dump_done; + } arg.skb = skb; @@ -669,46 +683,46 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) - goto out; + goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); - return -ENOENT; + err = -ENOENT; + goto unlock; } if (!cb->args[0]) { - res = fib6_dump_table(tb, skb, cb); - if (!res) + err = fib6_dump_table(tb, skb, cb); + if (!err) cb->args[0] = 1; } - goto out; + goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; - rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; - res = fib6_dump_table(tb, skb, cb); - if (res != 0) - goto out_unlock; + err = fib6_dump_table(tb, skb, cb); + if (err != 0) + goto out; next: e++; } } -out_unlock: - rcu_read_unlock(); +out: cb->args[1] = e; cb->args[0] = h; -out: - res = res < 0 ? res : skb->len; - if (res <= 0) + +unlock: + rcu_read_unlock(); + if (err <= 0) fib6_dump_end(cb); - return res; + return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) @@ -751,8 +765,6 @@ static struct fib6_node *fib6_add_1(struct net *net, int bit; __be32 dir = 0; - RT6_TRACE("fib6_add_1\n"); - /* insert node in tree */ fn = root; @@ -953,14 +965,14 @@ insert_above: } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, - const struct fib6_info *match, - const struct fib6_table *table) + const struct fib6_info *match) { int cpu; if (!fib6_nh->rt6i_pcpu) return; + rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ @@ -969,7 +981,9 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; + + /* Paired with xchg() in rt6_get_pcpu_route() */ + pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes @@ -979,27 +993,22 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; - from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } + rcu_read_unlock(); } -struct fib6_nh_pcpu_arg { - struct fib6_info *from; - const struct fib6_table *table; -}; - static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { - struct fib6_nh_pcpu_arg *arg = _arg; + struct fib6_info *arg = _arg; - __fib6_drop_pcpu_from(nh, arg->from, arg->table); + __fib6_drop_pcpu_from(nh, arg); return 0; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void fib6_drop_pcpu_from(struct fib6_info *f6i) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. @@ -1008,18 +1017,14 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { - struct fib6_nh_pcpu_arg arg = { - .from = f6i, - .table = table - }; - - nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, - &arg); + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i); + rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; - __fib6_drop_pcpu_from(fib6_nh, f6i, table); + __fib6_drop_pcpu_from(fib6_nh, f6i); } } @@ -1030,10 +1035,16 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt); + + if (rt->nh) { + spin_lock(&rt->nh->lock); - if (rt->nh && !list_empty(&rt->nh_list)) - list_del_init(&rt->nh_list); + if (!list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); + + spin_unlock(&rt->nh->lock); + } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1057,6 +1068,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, lockdep_is_held(&table->tb6_lock)); } } + + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); } /* @@ -1064,8 +1078,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, - struct nl_info *info, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -1114,13 +1128,20 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) - rt->fib6_nsiblings = 0; + WRITE_ONCE(rt->fib6_nsiblings, 0); if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->fib6_flags & RTF_EXPIRES)) + if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); - else + fib6_remove_gc_list(iter); + } else { fib6_set_expires(iter, rt->expires); + fib6_add_gc_list(iter); + } + if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT))) { + iter->fib6_flags &= ~RTF_ADDRCONF; + iter->fib6_flags &= ~RTF_PREFIX_RT; + } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, @@ -1140,7 +1161,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) - rt->fib6_nsiblings++; + WRITE_ONCE(rt->fib6_nsiblings, + rt->fib6_nsiblings + 1); } if (iter->fib6_metric > rt->fib6_metric) @@ -1175,8 +1197,8 @@ next_iter: while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->fib6_siblings, - &sibling->fib6_siblings); + list_add_tail_rcu(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, @@ -1190,12 +1212,15 @@ next_iter: fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { - sibling->fib6_nsiblings++; + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings + 1); BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); + rcu_read_unlock(); } /* @@ -1235,10 +1260,13 @@ add: list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) - sibling->fib6_nsiblings--; - rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings - 1); + WRITE_ONCE(rt->fib6_nsiblings, 0); + list_del_rcu(&rt->fib6_siblings); + rcu_read_lock(); rt6_multipath_rebalance(next_sibling); + rcu_read_unlock(); return err; } } @@ -1286,10 +1314,9 @@ add: } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1302,10 +1329,9 @@ add: if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1321,6 +1347,28 @@ add: return 0; } +static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) +{ + int err; + + spin_lock(&rt->nh->lock); + + if (rt->nh->dead) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = fib6_add_rt2node(fn, rt, info, extack, purge_list); + if (!err) + list_add(&rt->nh_list, &rt->nh->f6i_list); + } + + spin_unlock(&rt->nh->lock); + + return err; +} + static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && @@ -1375,7 +1423,11 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; - struct fib6_node *fn, *pn = NULL; + LIST_HEAD(purge_list); + struct fib6_node *fn; +#ifdef CONFIG_IPV6_SUBTREES + struct fib6_node *pn = NULL; +#endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; @@ -1399,9 +1451,9 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, goto out; } +#ifdef CONFIG_IPV6_SUBTREES pn = fn; -#ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; @@ -1474,11 +1526,24 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, extack); + if (rt->nh) + err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); + else + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { - if (rt->nh) - list_add(&rt->nh_list, &rt->nh->f6i_list); + struct fib6_info *iter, *next; + + list_for_each_entry_safe(iter, next, &purge_list, purge_link) { + list_del(&iter->purge_link); + fib6_purge_rt(iter, fn, info->nl_net); + fib6_info_release(iter); + } + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); + + if (rt->fib6_flags & RTF_EXPIRES) + fib6_add_gc_list(rt); + fib6_start_gc(info->nl_net, rt); } @@ -1501,13 +1566,9 @@ out: if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); + if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; - } -#endif fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } @@ -1807,7 +1868,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; - RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); @@ -1870,7 +1931,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net, FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { - RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", + w, w->state, nstate); w->node = pn; w->state = nstate; } @@ -1878,10 +1940,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (w->node == fn) { w->node = child; if (children&2) { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } @@ -1909,8 +1973,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct net *net = info->nl_net; bool notify_del = false; - RT6_TRACE("fib6_del_route\n"); - /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. @@ -1949,9 +2011,10 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) - sibling->fib6_nsiblings--; - rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings - 1); + WRITE_ONCE(rt->fib6_nsiblings, 0); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } @@ -1959,7 +2022,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { - RT6_TRACE("walker %p adjusted by delroute\n", w); + pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) @@ -2285,9 +2348,8 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct fib6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { - struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -2297,7 +2359,7 @@ static int fib6_age(struct fib6_info *rt, void *arg) if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { - RT6_TRACE("expiring %p\n", rt); + pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; @@ -2312,6 +2374,42 @@ static int fib6_age(struct fib6_info *rt, void *arg) return 0; } +static void fib6_gc_table(struct net *net, + struct fib6_table *tb6, + struct fib6_gc_args *gc_args) +{ + struct fib6_info *rt; + struct hlist_node *n; + struct nl_info info = { + .nl_net = net, + .skip_notify = false, + }; + + hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) + if (fib6_age(rt, gc_args) == -1) + fib6_del(rt, &info); +} + +static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) +{ + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + spin_lock_bh(&table->tb6_lock); + + fib6_gc_table(net, table, gc_args); + + spin_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; @@ -2327,7 +2425,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; - fib6_clean_all(net, fib6_age, &gc_args); + fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -2336,13 +2434,13 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else - del_timer(&net->ipv6.ip6_fib_timer); + timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { - struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); + struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } @@ -2376,6 +2474,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; + spin_lock_init(&net->ipv6.fib_table_hash_lock); + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) @@ -2387,6 +2487,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), @@ -2399,6 +2500,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); @@ -2421,7 +2523,7 @@ static void fib6_net_exit(struct net *net) { unsigned int i; - del_timer_sync(&net->ipv6.ip6_fib_timer); + timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; @@ -2444,14 +2546,18 @@ static struct pernet_operations fib6_net_ops = { .exit = fib6_net_exit, }; +static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, + .dumpit = inet6_dump_fib, + .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, +}; + int __init fib6_init(void) { int ret = -ENOMEM; - fib6_node_kmem = kmem_cache_create("fib6_nodes", - sizeof(struct fib6_node), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, - NULL); + fib6_node_kmem = KMEM_CACHE(fib6_node, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; @@ -2459,8 +2565,7 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, 0); + ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; @@ -2491,7 +2596,7 @@ static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) const struct net_device *dev; if (rt->nh) - fib6_nh = nexthop_fib6_nh_bh(rt->nh); + fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); @@ -2556,14 +2661,14 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; - node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist)); + node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { - node = rcu_dereference_bh( + node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); @@ -2593,7 +2698,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next); + n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; @@ -2619,12 +2724,12 @@ iter_table: } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU_BH) + __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; - rcu_read_lock_bh(); + rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; @@ -2645,7 +2750,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) - __releases(RCU_BH) + __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; @@ -2653,7 +2758,7 @@ static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); - rcu_read_unlock_bh(); + rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 18481eb76a0a..60d0be47a9f3 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -58,18 +58,18 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); EXPORT_SYMBOL(ipv6_flowlabel_exclusive); #define for_each_fl_rcu(hash, fl) \ - for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ + for (fl = rcu_dereference(fl_ht[(hash)]); \ fl != NULL; \ - fl = rcu_dereference_bh(fl->next)) + fl = rcu_dereference(fl->next)) #define for_each_fl_continue_rcu(fl) \ - for (fl = rcu_dereference_bh(fl->next); \ + for (fl = rcu_dereference(fl->next); \ fl != NULL; \ - fl = rcu_dereference_bh(fl->next)) + fl = rcu_dereference(fl->next)) -#define for_each_sk_fl_rcu(np, sfl) \ - for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \ +#define for_each_sk_fl_rcu(sk, sfl) \ + for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); \ sfl != NULL; \ - sfl = rcu_dereference_bh(sfl->next)) + sfl = rcu_dereference(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { @@ -86,11 +86,11 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - rcu_read_lock_bh(); + rcu_read_lock(); fl = __fl_lookup(net, label); if (fl && !atomic_inc_not_zero(&fl->users)) fl = NULL; - rcu_read_unlock_bh(); + rcu_read_unlock(); return fl; } @@ -217,6 +217,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->label = label & IPV6_FLOWLABEL_MASK; + rcu_read_lock(); spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { @@ -240,6 +241,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock(); return lfl; } } @@ -249,6 +251,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock(); return NULL; } @@ -259,37 +262,36 @@ static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; - struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) { + rcu_read_lock(); + for_each_sk_fl_rcu(sk, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { fl->lastuse = jiffies; - rcu_read_unlock_bh(); + rcu_read_unlock(); return fl; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ipv6_fl_socklist *sfl; - if (!rcu_access_pointer(np->ipv6_fl_list)) + if (!rcu_access_pointer(inet->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); - while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { - np->ipv6_fl_list = sfl->next; + inet->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); @@ -467,18 +469,17 @@ done: static int mem_check(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); + struct ipv6_fl_socklist *sfl; int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) + rcu_read_lock(); + for_each_sk_fl_rcu(sk, sfl) count++; - rcu_read_unlock_bh(); + rcu_read_unlock(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || @@ -489,13 +490,15 @@ static int mem_check(struct sock *sk) return 0; } -static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, - struct ip6_flowlabel *fl) +static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) { + struct inet_sock *inet = inet_sk(sk); + spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; - sfl->next = np->ipv6_fl_list; - rcu_assign_pointer(np->ipv6_fl_list, sfl); + sfl->next = inet->ipv6_fl_list; + rcu_assign_pointer(inet->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } @@ -510,14 +513,14 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, return 0; } - if (np->repflow) { + if (inet6_test_bit(REPFLOW, sk)) { freq->flr_label = np->flow_label; return 0; } - rcu_read_lock_bh(); + rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; @@ -527,11 +530,11 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, freq->flr_linger = sfl->fl->linger / HZ; spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return -ENOENT; } @@ -548,15 +551,15 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; - if (!np->repflow) + if (!inet6_test_bit(REPFLOW, sk)) return -ESRCH; np->flow_label = 0; - np->repflow = 0; + inet6_clear_bit(REPFLOW, sk); return 0; } spin_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; + for (sflp = &inet_sk(sk)->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) @@ -576,21 +579,20 @@ found: static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) { + rcu_read_lock(); + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); - rcu_read_unlock_bh(); + rcu_read_unlock(); return err; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); if (freq->flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { @@ -611,7 +613,6 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; @@ -623,7 +624,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; - np->repflow = 1; + inet6_set_bit(REPFLOW, sk); return 0; } @@ -641,11 +642,11 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (freq->flr_label) { err = -EEXIST; - rcu_read_lock_bh(); - for_each_sk_fl_rcu(np, sfl) { + rcu_read_lock(); + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { - rcu_read_unlock_bh(); + rcu_read_unlock(); goto done; } fl1 = sfl->fl; @@ -654,7 +655,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, break; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); if (!fl1) fl1 = fl_lookup(net, freq->flr_label); @@ -679,7 +680,7 @@ recheck: fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; - fl_link(np, sfl1, fl1); + fl_link(sk, sfl1, fl1); fl_free(fl); return 0; @@ -713,7 +714,7 @@ release: } } - fl_link(np, sfl1, fl); + fl_link(sk, sfl1, fl); return 0; done: fl_free(fl); @@ -809,7 +810,7 @@ static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); - rcu_read_lock_bh(); + rcu_read_lock(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -828,7 +829,7 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip6fl_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { - rcu_read_unlock_bh(); + rcu_read_unlock(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) @@ -904,6 +905,6 @@ int ip6_flowlabel_init(void) void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); - del_timer(&ip6_fl_gc_timer); + timer_delete(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 89f5f0f3f5d6..c82a75510c0e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -43,6 +43,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/ipv6.h> @@ -110,8 +111,32 @@ static u32 HASH_ADDR(const struct in6_addr *addr) #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] -/* Given src, dst and key, find appropriate for input tunnel. */ +static bool ip6gre_tunnel_match(struct ip6_tnl *t, int dev_type, int link, + int *cand_score, struct ip6_tnl **ret) +{ + int score = 0; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + return false; + + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) { + *ret = t; + return true; + } + + if (score < *cand_score) { + *ret = t; + *cand_score = score; + } + return false; +} +/* Given src, dst and key, find appropriate for input tunnel. */ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, const struct in6_addr *remote, const struct in6_addr *local, __be32 key, __be16 gre_proto) @@ -126,8 +151,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, gre_proto == htons(ETH_P_ERSPAN) || gre_proto == htons(ETH_P_ERSPAN2)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; - int score, cand_score = 4; struct net_device *ndev; + int cand_score = 4; for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || @@ -136,22 +161,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { @@ -160,22 +171,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { @@ -186,22 +183,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { @@ -209,22 +192,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } if (cand) @@ -360,9 +329,9 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (parms->name[0]) { if (!dev_valid_name(parms->name)) return NULL; - strscpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name); } else { - strcpy(name, "ip6gre%d"); + strscpy(name, "ip6gre%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6gre_tunnel_setup); @@ -496,11 +465,11 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) tpi->proto); if (tunnel) { if (tunnel->parms.collect_md) { + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; __be64 tun_id; - __be16 flags; - flags = tpi->flags; + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); @@ -528,6 +497,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, struct ip6_tnl *tunnel; u8 ver; + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; @@ -548,14 +520,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (tunnel->parms.collect_md) { struct erspan_metadata *pkt_md, *md; + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; - __be16 flags; - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, @@ -577,7 +549,8 @@ static int ip6erspan_rcv(struct sk_buff *skb, md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); info->options_len = sizeof(*md); ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); @@ -745,8 +718,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 protocol; - __be16 flags; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -778,8 +751,11 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - flags = key->tun_flags & - (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); + ip_tunnel_flags_zero(flags); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, flags, key->tun_flags); tun_hlen = gre_calc_hlen(flags); if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) @@ -788,19 +764,21 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } else { if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) return -ENOMEM; - flags = tunnel->parms.o_flags; + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, @@ -822,7 +800,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return -1; @@ -856,7 +835,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags))) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, @@ -883,7 +863,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return err; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); @@ -936,6 +917,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info = NULL; struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; bool truncate = false; int encap_limit = -1; __u8 dsfield = false; @@ -955,11 +937,12 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); + if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) + goto tx_err; truncate = true; } - nhoff = skb_network_header(skb) - skb_mac_header(skb); + nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; @@ -968,7 +951,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, int thoff; if (skb_transport_header_was_set(skb)) - thoff = skb_transport_header(skb) - skb_mac_header(skb); + thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) @@ -978,7 +961,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; - t->parms.o_flags &= ~TUNNEL_KEY; + __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); IPCB(skb)->flags = 0; /* For collect_md mode, derive fl6 from the tunnel key, @@ -1003,7 +986,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_info->key.tun_flags)) goto tx_err; if (tun_info->options_len < sizeof(*md)) goto tx_err; @@ -1015,12 +999,14 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, ntohl(tun_id), ntohl(md->u.index), truncate, false); + proto = htons(ETH_P_ERSPAN); } else if (md->version == 2) { erspan_build_header_v2(skb, ntohl(tun_id), md->u.md2.dir, get_hwid(&md->u.md2), truncate, false); + proto = htons(ETH_P_ERSPAN2); } else { goto tx_err; } @@ -1043,30 +1029,35 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, break; } - if (t->parms.erspan_ver == 1) + if (t->parms.erspan_ver == 1) { erspan_build_header(skb, ntohl(t->parms.o_key), t->parms.index, truncate, false); - else if (t->parms.erspan_ver == 2) + proto = htons(ETH_P_ERSPAN); + } else if (t->parms.erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(t->parms.o_key), t->parms.dir, t->parms.hwid, truncate, false); - else + proto = htons(ETH_P_ERSPAN2); + } else { goto tx_err; + } fl6.daddr = t->parms.raddr; } /* Push GRE header. */ - proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN) - : htons(ETH_P_ERSPAN2); - gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ - if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); - + if (!t->parms.collect_md && dst) { + mtu = READ_ONCE(dst_dev(dst)->mtu); + if (dst_mtu(dst) > mtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); + } err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); if (err != 0) { @@ -1204,8 +1195,8 @@ static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, t->parms.proto = p->proto; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); t->parms.fwmark = p->fwmark; t->parms.erspan_ver = p->erspan_ver; t->parms.index = p->index; @@ -1234,8 +1225,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; - p->i_flags = gre_flags_to_tnl_flags(u->i_flags); - p->o_flags = gre_flags_to_tnl_flags(u->o_flags); + gre_flags_to_tnl_flags(p->i_flags, u->i_flags); + gre_flags_to_tnl_flags(p->o_flags, u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } @@ -1387,7 +1378,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, ipv6h->daddr = t->parms.raddr; p = (__be16 *)(ipv6h + 1); - p[0] = t->parms.o_flags; + p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags); p[1] = htons(type); /* @@ -1414,7 +1405,6 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1424,7 +1414,6 @@ static void ip6gre_dev_free(struct net_device *dev) gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); } static void ip6gre_tunnel_setup(struct net_device *dev) @@ -1433,6 +1422,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_IP6GRE; dev->flags |= IFF_NOARP; @@ -1451,23 +1441,23 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); - __be16 flags; - dev->features |= GRE6_FEATURES | NETIF_F_LLTX; + dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; - flags = nt->parms.o_flags; - /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ - if (flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags)) return; - if (flags & TUNNEL_CSUM && nt->encap.type != TUNNEL_ENCAP_NONE) + if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) && + nt->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ip6gre_tunnel_init_common(struct net_device *dev) @@ -1479,16 +1469,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1507,13 +1492,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) ip6gre_tnl_init_features(dev); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1546,7 +1529,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; } @@ -1557,7 +1540,7 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = { .flags = INET6_PROTO_FINAL, }; -static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; @@ -1574,16 +1557,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; - t = rtnl_dereference(ign->tunnels[prio][h]); + t = rtnl_net_dereference(net, ign->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1608,8 +1591,7 @@ static int __net_init ip6gre_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - + ign->fb_tunnel_dev->netns_immutable = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; @@ -1628,21 +1610,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) -{ - struct net *net; - LIST_HEAD(list); - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); -} - static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch = ip6gre_exit_batch_net, + .exit_rtnl = ip6gre_exit_rtnl_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; @@ -1789,12 +1759,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1847,7 +1817,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1873,16 +1842,11 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1899,13 +1863,11 @@ static int ip6erspan_tap_init(struct net_device *dev) ip6erspan_tnl_link_config(tunnel, 1); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1916,7 +1878,6 @@ static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1930,6 +1891,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -1968,7 +1930,7 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[], return ret; } -static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, +static int ip6gre_newlink_common(struct net *link_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -1989,7 +1951,7 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); nt->dev = dev; - nt->net = dev_net(dev); + nt->net = link_net; err = register_netdevice(dev); if (err) @@ -2002,12 +1964,14 @@ out: return err; } -static int ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6gre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? : dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2022,7 +1986,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, nt); @@ -2140,11 +2104,13 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm *p = &t->parms; - __be16 o_flags = p->o_flags; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (p->erspan_ver == 1 || p->erspan_ver == 2) { if (!p->collect_md) - o_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) goto nla_put_failure; @@ -2230,17 +2196,20 @@ static void ip6erspan_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); } -static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? : dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2256,7 +2225,7 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, nt); @@ -2401,7 +2370,7 @@ static void __exit ip6gre_fini(void) module_init(ip6gre_init); module_exit(ip6gre_fini); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 9e3574880cb0..233914b63bdb 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -54,11 +54,12 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); - if (!ct || !(ct->status & IPS_SRC_NAT)) { + if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } @@ -73,7 +74,8 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; - ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; + dir = CTINFO2DIR(ctinfo); + ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index e1ebf5e42ebe..168ec07e31cc 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -99,7 +99,8 @@ static bool ip6_can_use_hint(const struct sk_buff *skb, static struct sk_buff *ip6_extract_route_hint(const struct net *net, struct sk_buff *skb) { - if (fib6_routes_require_src(net) || fib6_has_custom_rules(net)) + if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) || + IP6CB(skb)->flags & IP6SKB_MULTIPATH) return NULL; return skb; @@ -110,9 +111,8 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; @@ -167,9 +167,9 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, SKB_DR_SET(reason, NOT_SPECIFIED); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || - !idev || unlikely(idev->cnf.disable_ipv6)) { + !idev || unlikely(READ_ONCE(idev->cnf.disable_ipv6))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); - if (idev && unlikely(idev->cnf.disable_ipv6)) + if (idev && unlikely(READ_ONCE(idev->cnf.disable_ipv6))) SKB_DR_SET(reason, IPV6DISABLED); goto drop; } @@ -187,7 +187,9 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ - IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; + IP6CB(skb)->iif = skb_valid_dst(skb) ? + ip6_dst_idev(skb_dst(skb))->dev->ifindex : + dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; @@ -235,7 +237,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, if (!ipv6_addr_is_multicast(&hdr->daddr) && (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) && - idev->cnf.drop_unicast_in_l2_multicast) { + READ_ONCE(idev->cnf.drop_unicast_in_l2_multicast)) { SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST); goto err; } @@ -326,9 +328,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); @@ -404,10 +405,6 @@ resubmit_final: /* Only do this once for first final protocol */ have_final = true; - /* Free reference early: we don't need it any more, - and it may hold ip_conntrack module loaded - indefinitely. */ - nf_reset_ct(skb); skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); @@ -430,10 +427,12 @@ resubmit_final: goto discard; } } - if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && - !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - SKB_DR_SET(reason, XFRM_POLICY); - goto discard; + if (!(ipprot->flags & INET6_PROTO_NOPOLICY)) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + SKB_DR_SET(reason, XFRM_POLICY); + goto discard; + } + nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv, @@ -479,10 +478,15 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INDISCARDS); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); + return 0; + } + skb_clear_delivery_time(skb); - rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); - rcu_read_unlock(); return 0; } @@ -490,46 +494,46 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_input_finish); + int res; + + rcu_read_lock(); + res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, + ip6_input_finish); + rcu_read_unlock(); + + return res; } EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + struct net_device *dev = skb->dev; int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; - struct net_device *dev; bool deliver; - __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), - __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, - skb->len); + __IP6_UPD_PO_STATS(skb_dst_dev_net_rcu(skb), + __in6_dev_get_safely(dev), IPSTATS_MIB_INMCAST, + skb->len); /* skb->dev passed may be master dev for vrfs. */ if (sdif) { - rcu_read_lock(); - dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + dev = dev_get_by_index_rcu(dev_net_rcu(dev), sdif); if (!dev) { - rcu_read_unlock(); kfree_skb(skb); return -ENODEV; } - } else { - dev = skb->dev; } hdr = ipv6_hdr(skb); deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); - if (sdif) - rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* * IPv6 multicast router mode is now supported ;) */ - if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) && + if (atomic_read(&dev_net_rcu(skb->dev)->ipv6.devconf_all->mc_forwarding) && !(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { @@ -570,22 +574,21 @@ int ip6_mc_input(struct sk_buff *skb) /* unknown RA - process it normally */ } - if (deliver) + if (deliver) { skb2 = skb_clone(skb, GFP_ATOMIC); - else { + } else { skb2 = skb; skb = NULL; } - if (skb2) { + if (skb2) ip6_mr_input(skb2); - } } out: #endif - if (likely(deliver)) + if (likely(deliver)) { ip6_input(skb); - else { + } else { /* discard */ kfree_skb(skb); } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 00dc2e3b0184..fce91183797a 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -16,6 +16,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/gro.h> +#include <net/gso.h> #include "ip6_offload.h" @@ -36,6 +37,40 @@ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ }) +static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) +{ + const struct net_offload *ops = NULL; + struct ipv6_opt_hdr *opth; + + for (;;) { + int len; + + ops = rcu_dereference(inet6_offloads[proto]); + + if (unlikely(!ops)) + break; + + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; + + opth = skb_gro_header(skb, off + sizeof(*opth), off); + if (unlikely(!opth)) + break; + + len = ipv6_optlen(opth); + + opth = skb_gro_header(skb, off + len, off); + if (unlikely(!opth)) + break; + proto = opth->nexthdr; + + off += len; + } + + skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb)); + return proto; +} + static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; @@ -44,15 +79,13 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) struct ipv6_opt_hdr *opth; int len; - if (proto != NEXTHDR_HOP) { - ops = rcu_dereference(inet6_offloads[proto]); + ops = rcu_dereference(inet6_offloads[proto]); - if (unlikely(!ops)) - break; + if (unlikely(!ops)) + break; - if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) - break; - } + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; if (unlikely(!pskb_may_pull(skb, 8))) break; @@ -115,7 +148,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { - skb_reset_transport_header(skb); + if (!skb_reset_transport_header_careful(skb)) + goto out; + segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; @@ -170,13 +205,12 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, proto = iph->nexthdr; for (;;) { - if (proto != NEXTHDR_HOP) { - *opps = rcu_dereference(inet6_offloads[proto]); - if (unlikely(!(*opps))) - break; - if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) - break; - } + *opps = rcu_dereference(inet6_offloads[proto]); + if (unlikely(!(*opps))) + break; + if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) + break; + opth = (void *)opth + optlen; optlen = ipv6_optlen(opth); len += optlen; @@ -204,33 +238,30 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, if (unlikely(!iph)) goto out; - skb_set_network_header(skb, off); - skb_gro_pull(skb, sizeof(*iph)); - skb_set_transport_header(skb, skb_gro_offset(skb)); + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; - flush += ntohs(iph->payload_len) != skb_gro_len(skb); + flush += ntohs(iph->payload_len) != skb->len - hlen; proto = iph->nexthdr; ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { - pskb_pull(skb, skb_gro_offset(skb)); - skb_gro_frag0_invalidate(skb); - proto = ipv6_gso_pull_exthdrs(skb, proto); - skb_gro_pull(skb, -skb_transport_offset(skb)); - skb_reset_transport_header(skb); - __skb_push(skb, skb_gro_offset(skb)); + proto = ipv6_gro_pull_exthdrs(skb, hlen, proto); ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; - iph = ipv6_hdr(skb); + iph = skb_gro_network_header(skb); + } else { + skb_gro_pull(skb, sizeof(*iph)); } + skb_set_transport_header(skb, skb_gro_offset(skb)); + NAPI_GRO_CB(skb)->proto = proto; flush--; - nlen = skb_network_header_len(skb); + nlen = skb_gro_offset(skb) - off; list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; @@ -261,19 +292,8 @@ not_same_flow: nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } - /* flush if Traffic Class fields are different */ - NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) | - (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); - NAPI_GRO_CB(p)->flush |= flush; - - /* If the previous IP ID value was based on an atomic - * datagram we can overwrite the value and ignore it. - */ - if (NAPI_GRO_CB(skb)->is_atomic) - NAPI_GRO_CB(p)->flush_id = 0; } - NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); @@ -390,14 +410,6 @@ static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff) return inet_gro_complete(skb, nhoff); } -static struct packet_offload ipv6_packet_offload __read_mostly = { - .type = cpu_to_be16(ETH_P_IPV6), - .callbacks = { - .gso_segment = ipv6_gso_segment, - .gro_receive = ipv6_gro_receive, - .gro_complete = ipv6_gro_complete, - }, -}; static struct sk_buff *sit_gso_segment(struct sk_buff *skb, netdev_features_t features) @@ -457,7 +469,15 @@ static int __init ipv6_offload_init(void) if (ipv6_exthdrs_offload_init() < 0) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); - dev_add_offload(&ipv6_packet_offload); + net_hotdata.ipv6_packet_offload = (struct packet_offload) { + .type = cpu_to_be16(ETH_P_IPV6), + .callbacks = { + .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, + }, + }; + dev_add_offload(&net_hotdata.ipv6_packet_offload); inet_add_offload(&sit_offload, IPPROTO_IPV6); inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c314fdde0097..f904739e99b9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -42,6 +42,7 @@ #include <net/sock.h> #include <net/snmp.h> +#include <net/gso.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> @@ -59,7 +60,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; @@ -69,6 +70,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { + /* idev stays alive because we hold rcu_read_lock(). */ skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); @@ -112,19 +114,19 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); - if (res < 0 || res == LWTUNNEL_XMIT_DONE) + if (res != LWTUNNEL_XMIT_CONTINUE) return res; } - rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + + nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); - if (unlikely(IS_ERR_OR_NULL(neigh))) { + if (IS_ERR_OR_NULL(neigh)) { if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { - rcu_read_unlock_bh(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; @@ -132,7 +134,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); - rcu_read_unlock_bh(); return ret; } @@ -161,7 +162,13 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, int err; skb_mark_not_on_list(segs); - err = ip6_fragment(net, sk, segs, ip6_finish_output2); + /* Last GSO segment can be smaller than gso_size (and MTU). + * Adding a fragment header would produce an "atomic fragment", + * which is considered harmful (RFC-8021). Avoid that. + */ + err = segs->len > mtu ? + ip6_fragment(net, sk, segs, ip6_finish_output2) : + ip6_finish_output2(net, sk, segs); if (err && ret == 0) ret = err; } @@ -169,6 +176,16 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, return ret; } +static int ip6_finish_output_gso(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) +{ + if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && + !skb_gso_validate_network_len(skb, mtu)) + return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); + + return ip6_finish_output2(net, sk, skb); +} + static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; @@ -182,17 +199,14 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff #endif mtu = ip6_skb_dst_mtu(skb); - if (skb_is_gso(skb) && - !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && - !skb_gso_validate_network_len(skb, mtu)) - return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); + if (skb_is_gso(skb)) + return ip6_finish_output_gso(net, sk, skb, mtu); - if ((skb->len > mtu && !skb_is_gso(skb)) || - dst_allfrag(skb_dst(skb)) || + if (skb->len > mtu || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); - else - return ip6_finish_output2(net, sk, skb); + + return ip6_finish_output2(net, sk, skb); } static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -212,35 +226,42 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; - struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev, *indev = skb->dev; + struct inet6_dev *idev; + int ret; skb->protocol = htons(ETH_P_IPV6); + rcu_read_lock(); + dev = dst_dev_rcu(dst); + idev = ip6_dst_idev(dst); skb->dev = dev; - if (unlikely(idev->cnf.disable_ipv6)) { + if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip6_finish_output, - !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_output); -bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +bool ip6_autoflowlabel(struct net *net, const struct sock *sk) { - if (!np->autoflowlabel_set) + if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) return ip6_default_np_autolabel(net); - else - return np->autoflowlabel; + return inet6_test_bit(AUTOFLOWLABEL, sk); } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP and SCTP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. @@ -248,30 +269,35 @@ bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { - struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); + struct net *net = sock_net(sk); unsigned int head_room; + struct net_device *dev; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; - int hlimit = -1; + int ret, hlimit = -1; u32 mtu; + rcu_read_lock(); + + dev = dst_dev_rcu(dst); head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { + /* idev stays alive while we hold rcu_read_lock(). */ skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - return -ENOBUFS; + ret = -ENOBUFS; + goto unlock; } } @@ -308,12 +334,12 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * Fill in the IPv6 header */ if (np) - hlimit = np->hop_limit; + hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - ip6_autoflowlabel(net, np), fl6)); + ip6_autoflowlabel(net, sk), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -328,23 +354,27 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out((struct sock *)sk, skb); - if (unlikely(!skb)) - return 0; + if (unlikely(!skb)) { + ret = 0; + goto unlock; + } /* hooks should never assume socket lock is held. * we promote our socket to non const */ - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, (struct sock *)sk, skb, NULL, dev, - dst_output); + ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, (struct sock *)sk, skb, NULL, dev, + dst_output); + goto unlock; } + ret = -EMSGSIZE; skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const @@ -353,7 +383,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); - return -EMSGSIZE; +unlock: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_xmit); @@ -368,9 +400,8 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel) if (sk && ra->sel == sel && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == skb->dev->ifindex)) { - struct ipv6_pinfo *np = inet6_sk(sk); - if (np && np->rtalert_isolate && + if (inet6_test_bit(RTALERT_ISOLATE, sk) && !net_eq(sock_net(sk), dev_net(skb->dev))) { continue; } @@ -447,11 +478,6 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - - __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); - __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); - #ifdef CONFIG_NET_SWITCHDEV if (skb->offload_l3_fwd_mark) { consume_skb(skb); @@ -486,13 +512,15 @@ int ip6_forward(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(dst->dev); + struct net *net = dev_net(dst_dev(dst)); + struct net_device *dev; struct inet6_dev *idev; SKB_DR(reason); u32 mtu; idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); - if (net->ipv6.devconf_all->forwarding == 0) + if (!READ_ONCE(net->ipv6.devconf_all->forwarding) && + (!idev || !READ_ONCE(idev->cnf.force_forwarding))) goto error; if (skb->pkt_type != PACKET_HOST) @@ -504,8 +532,8 @@ int ip6_forward(struct sk_buff *skb) if (skb_warn_if_lro(skb)) goto drop; - if (!net->ipv6.devconf_all->disable_policy && - (!idev || !idev->cnf.disable_policy) && + if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) && + (!idev || !READ_ONCE(idev->cnf.disable_policy)) && !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; @@ -543,8 +571,8 @@ int ip6_forward(struct sk_buff *skb) } /* XXX: idev->cnf.proxy_ndp? */ - if (net->ipv6.devconf_all->proxy_ndp && - pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit @@ -574,12 +602,12 @@ int ip6_forward(struct sk_buff *skb) goto drop; } dst = skb_dst(skb); - + dev = dst_dev(dst); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ - if (IP6CB(skb)->iif == dst->dev->ifindex && + if (IP6CB(skb)->iif == dev->ifindex && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct inet_peer *peer; @@ -590,21 +618,21 @@ int ip6_forward(struct sk_buff *skb) * send a redirect. */ - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) target = &rt->rt6i_gateway; else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); + rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } else { int addrtype = ipv6_addr_type(&hdr->saddr); @@ -619,13 +647,15 @@ int ip6_forward(struct sk_buff *skb) } } + __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + mtu = ip6_dst_mtu_maybe_forward(dst, true); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ - skb->dev = dst->dev; + skb->dev = dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), @@ -634,7 +664,7 @@ int ip6_forward(struct sk_buff *skb) return -EMSGSIZE; } - if (skb_cow(skb, dst->dev->hard_header_len)) { + if (skb_cow(skb, dev->hard_header_len)) { __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; @@ -647,7 +677,7 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, skb->dev, dst->dev, + net, NULL, skb, skb->dev, dev, ip6_forward_finish); error: @@ -845,10 +875,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; @@ -881,9 +911,11 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, mtu = IPV6_MIN_MTU; } - if (np && np->frag_size < mtu) { - if (np->frag_size) - mtu = np->frag_size; + if (np) { + u32 frag_size = READ_ONCE(np->frag_size); + + if (frag_size && frag_size < mtu) + mtu = frag_size; } if (mtu < hlen + sizeof(struct frag_hdr) + 8) goto fail_toobig; @@ -942,7 +974,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -1003,7 +1035,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(frag, tstamp, mono_delivery_time); + skb_set_delivery_time(frag, tstamp, tstamp_type); err = output(net, sk, frag); if (err) goto fail; @@ -1017,9 +1049,6 @@ slow_path: return err; fail_toobig: - if (skb->sk && dst_allfrag(skb_dst(skb))) - sk_gso_disable(skb->sk); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); err = -EMSGSIZE; @@ -1053,7 +1082,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, return NULL; } - rt = (struct rt6_info *)dst; + rt = dst_rt6_info(dst); /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, @@ -1071,11 +1100,13 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? &np->saddr : NULL) || #endif - (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); dst = NULL; } @@ -1108,12 +1139,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct rt6_info *rt; *dst = ip6_route_output(net, sk, fl6); - rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; + rt = (*dst)->error ? NULL : dst_rt6_info(*dst); rcu_read_lock(); from = rt ? rcu_dereference(rt->from) : NULL; err = ip6_route_get_saddr(net, from, &fl6->daddr, - sk ? inet6_sk(sk)->srcprefs : 0, + sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, + fl6->flowi6_l3mdev, &fl6->saddr); rcu_read_unlock(); @@ -1149,12 +1181,12 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * dst entry and replace it instead with the * dst entry of the nexthop router */ - rt = (struct rt6_info *) *dst; - rcu_read_lock_bh(); + rt = dst_rt6_info(*dst); + rcu_read_lock(); n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); - err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; - rcu_read_unlock_bh(); + err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0; + rcu_read_unlock(); if (err) { struct inet6_ifaddr *ifp; @@ -1283,74 +1315,6 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); -/** - * ip6_dst_lookup_tunnel - perform route lookup on tunnel - * @skb: Packet for which lookup is done - * @dev: Tunnel device - * @net: Network namespace of tunnel device - * @sock: Socket which provides route info - * @saddr: Memory to store the src ip address - * @info: Tunnel information - * @protocol: IP protocol - * @use_cache: Flag to enable cache usage - * This function performs a route lookup on a tunnel - * - * It returns a valid dst pointer and stores src address to be used in - * tunnel in param saddr on success, else a pointer encoded error code. - */ - -struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, - struct net_device *dev, - struct net *net, - struct socket *sock, - struct in6_addr *saddr, - const struct ip_tunnel_info *info, - u8 protocol, - bool use_cache) -{ - struct dst_entry *dst = NULL; -#ifdef CONFIG_DST_CACHE - struct dst_cache *dst_cache; -#endif - struct flowi6 fl6; - __u8 prio; - -#ifdef CONFIG_DST_CACHE - dst_cache = (struct dst_cache *)&info->dst_cache; - if (use_cache) { - dst = dst_cache_get_ip6(dst_cache, saddr); - if (dst) - return dst; - } -#endif - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = protocol; - fl6.daddr = info->key.u.ipv6.dst; - fl6.saddr = info->key.u.ipv6.src; - prio = info->key.tos; - fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label); - - dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, - NULL); - if (IS_ERR(dst)) { - netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); - return ERR_PTR(-ENETUNREACH); - } - if (dst->dev == dev) { /* is this necessary? */ - netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); - dst_release(dst); - return ERR_PTR(-ELOOP); - } -#ifdef CONFIG_DST_CACHE - if (use_cache) - dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); -#endif - *saddr = fl6.saddr; - return dst; -} -EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel); - static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, gfp_t gfp) { @@ -1392,7 +1356,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct rt6_info *rt) { struct ipv6_pinfo *np = inet6_sk(sk); - unsigned int mtu; + unsigned int mtu, frag_size; struct ipv6_txoptions *nopt, *opt = ipc6->opt; /* callers pass dst together with a reference, set it first so @@ -1435,26 +1399,29 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; + v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); - if (np->frag_size < mtu) { - if (np->frag_size) - mtu = np->frag_size; - } + + frag_size = READ_ONCE(np->frag_size); + if (frag_size && frag_size < mtu) + mtu = frag_size; + cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; - sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); - - if (dst_allfrag(xfrm_dst_path(&rt->dst))) - cork->base.flags |= IPCORK_ALLFRAG; + cork->base.priority = ipc6->sockc.priority; + sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); + if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { + cork->base.flags |= IPCORK_TS_OPT_ID; + cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; + } cork->base.length = 0; - cork->base.transmit_time = ipc6->sockc.transmit_time; return 0; @@ -1468,7 +1435,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6) + unsigned int flags) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1483,12 +1450,12 @@ static int __ip6_append_data(struct sock *sk, int offset = 0; bool zc = false; u32 tskey = 0; - struct rt6_info *rt = (struct rt6_info *)cork->dst; + struct rt6_info *rt = dst_rt6_info(cork->dst); + bool paged, hold_tskey = false, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref = false; skb = skb_peek_tail(queue); if (!skb) { @@ -1500,10 +1467,6 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; - if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; - hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + @@ -1511,8 +1474,6 @@ static int __ip6_append_data(struct sock *sk, headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + - (dst_allfrag(&rt->dst) ? - sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; if (mtu <= fragheaderlen || @@ -1528,7 +1489,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && ipc6->dontfrag && + if (cork->length + length > mtu - headersize && v6_cork->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1576,7 +1537,8 @@ emsgsize: uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1589,6 +1551,25 @@ emsgsize: skb_zcopy_set(skb, uarg, &extra_uref); } } + } else if ((flags & MSG_SPLICE_PAGES) && length) { + if (inet_test_bit(HDRINCL, sk)) + return -EPERM; + if (rt->dst.dev->features & NETIF_F_SG && + getfrag == ip_generic_getfrag) + /* We need an empty buffer to attach stuff to */ + paged = true; + else + flags &= ~MSG_SPLICE_PAGES; + } + + if (cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + if (cork->flags & IPCORK_TS_OPT_ID) { + tskey = cork->ts_opt_id; + } else { + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + hold_tskey = true; + } } /* @@ -1613,7 +1594,7 @@ emsgsize: while (length > 0) { /* Check if the remaining data fits into current packet. */ - copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; @@ -1644,7 +1625,7 @@ alloc_new_skb: */ datalen = length + fraggap; - if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; fraglen = datalen + fragheaderlen; pagedlen = 0; @@ -1683,7 +1664,10 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; copy = datalen - transhdrlen - fraggap - pagedlen; - if (copy < 0) { + /* [!] NOTE: copy may be negative if pagedlen>0 + * because then the equation may reduces to -fraggap. + */ + if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { err = -EINVAL; goto error; } @@ -1729,11 +1713,14 @@ alloc_new_skb: pskb_trim_unique(skb_prev, maxfraglen); } if (copy > 0 && - getfrag(from, data + transhdrlen, offset, - copy, fraggap, skb) < 0) { + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; + } else if (flags & MSG_SPLICE_PAGES) { + copy = 0; } offset += copy; @@ -1772,12 +1759,25 @@ alloc_new_skb: unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } + } else if (flags & MSG_SPLICE_PAGES) { + struct msghdr *msg = from; + + err = -EIO; + if (WARN_ON_ONCE(copy > msg->msg_iter.count)) + goto error; + + err = skb_splice_from_iter(skb, &msg->msg_iter, copy); + if (err < 0) + goto error; + copy = err; + wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; @@ -1798,7 +1798,8 @@ alloc_new_skb: get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; @@ -1829,6 +1830,8 @@ error: cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + if (hold_tskey) + atomic_dec(&sk->sk_tskey); return err; } @@ -1866,7 +1869,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6); + from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1875,7 +1878,6 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) struct dst_entry *dst = cork->base.dst; cork->base.dst = NULL; - cork->base.flags &= ~IPCORK_ALLFRAG; skb_dst_set(skb, dst); } @@ -1896,7 +1898,6 @@ static void ip6_cork_release(struct inet_cork_full *cork, if (cork->base.dst) { dst_release(cork->base.dst); cork->base.dst = NULL; - cork->base.flags &= ~IPCORK_ALLFRAG; } } @@ -1908,11 +1909,10 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct in6_addr *final_dst; - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = v6_cork->opt; - struct rt6_info *rt = (struct rt6_info *)cork->base.dst; + struct rt6_info *rt = dst_rt6_info(cork->base.dst); struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; @@ -1951,22 +1951,31 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - ip6_autoflowlabel(net, np), fl6)); + ip6_autoflowlabel(net, sk), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; - skb->priority = sk->sk_priority; + skb->priority = cork->base.priority; skb->mark = cork->base.mark; - skb->tstamp = cork->base.transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); ip6_cork_steal_dst(skb, cork); - IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + u8 icmp6_type; - ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); + if (sk->sk_socket->type == SOCK_RAW && + !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) + icmp6_type = fl6->fl6_icmp_type; + else + icmp6_type = icmp6_hdr(skb)->icmp6_type; + ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } @@ -1978,9 +1987,10 @@ out: int ip6_send_skb(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int err; + rcu_read_lock(); err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) @@ -1990,6 +2000,7 @@ int ip6_send_skb(struct sk_buff *skb) IPSTATS_MIB_OUTDISCARDS); } + rcu_read_unlock(); return err; } @@ -2057,13 +2068,11 @@ struct sk_buff *ip6_make_skb(struct sock *sk, ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } - if (ipc6->dontfrag < 0) - ipc6->dontfrag = inet6_sk(sk)->dontfrag; err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6); + flags); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 47b6607a1370..6405072050e0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -52,7 +52,9 @@ #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/dst_metadata.h> +#include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); @@ -247,14 +249,12 @@ static void ip6_dev_free(struct net_device *dev) gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; @@ -399,7 +399,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); - u8 next, nexthdr = ipv6h->nexthdr; + u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; @@ -410,25 +410,25 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { - struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; - if (frag_hdr->frag_off) - break; optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } - /* cache hdr->nexthdr, since pskb_may_pull() might - * invalidate hdr - */ - next = hdr->nexthdr; - if (nexthdr == NEXTHDR_DEST) { - u16 i = 2; - /* Remember : hdr is no longer valid at this point. */ - if (!pskb_may_pull(skb, off + optlen)) + if (!pskb_may_pull(skb, off + optlen)) + break; + + hdr = (struct ipv6_opt_hdr *)(skb->data + off); + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; + + if (frag_hdr->frag_off) break; + } + if (nexthdr == NEXTHDR_DEST) { + u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; @@ -449,7 +449,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) i++; } } - nexthdr = next; + nexthdr = hdr->nexthdr; off += optlen; } return 0; @@ -609,7 +609,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, - 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + 0, 0, 0, IPPROTO_IPIP, + eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; @@ -620,7 +621,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, - IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + IPPROTO_IPIP, + eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); @@ -628,9 +630,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } skb_dst_set(skb2, &rt->dst); } else { - if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, - skb2->dev) || - skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, + ip4h_dscp(eiph), skb2->dev) || + skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6) goto out; } @@ -796,20 +798,18 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, struct sk_buff *skb), bool log_ecn_err) { - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); - int err; + const struct ipv6hdr *ipv6h; + int nh, err; - if ((!(tpi->flags & TUNNEL_CSUM) && - (tunnel->parms.i_flags & TUNNEL_CSUM)) || - ((tpi->flags & TUNNEL_CSUM) && - !(tunnel->parms.i_flags & TUNNEL_CSUM))) { + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != + test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } - if (tunnel->parms.i_flags & TUNNEL_SEQ) { - if (!(tpi->flags & TUNNEL_SEQ) || + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { + if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); @@ -829,7 +829,6 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, goto drop; } - ipv6h = ipv6_hdr(skb); skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { @@ -837,7 +836,23 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_reset_mac_header(skb); } + /* Save offset of outer header relative to skb->head, + * because we are going to reset the network header to the inner header + * and might change skb->head. + */ + nh = skb_network_header(skb) - skb->head; + skb_reset_network_header(skb); + + if (!pskb_inet_may_pull(skb)) { + DEV_STATS_INC(tunnel->dev, rx_length_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); + goto drop; + } + + /* Get the outer header. */ + ipv6h = (struct ipv6hdr *)(skb->head + nh); + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); @@ -932,7 +947,9 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { - tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + + tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } @@ -1162,7 +1179,7 @@ route_lookup: ndst = dst; } - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); @@ -1238,10 +1255,9 @@ route_lookup: /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ - max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; + ip_tunnel_adj_headroom(dev, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) @@ -1261,7 +1277,7 @@ route_lookup: ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; - ip6tunnel_xmit(NULL, skb, dev); + ip6tunnel_xmit(NULL, skb, dev, 0); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); @@ -1493,7 +1509,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { - dev->hard_header_len = tdev->hard_header_len + t_hlen; + dev->needed_headroom = tdev->hard_header_len + + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; @@ -1544,11 +1561,22 @@ static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) netdev_state_change(t->dev); } -static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, + bool strict) { - /* for default tnl0 device allow to change only the proto */ + /* For the default ip6tnl0 device, allow changing only the protocol + * (the IP6_TNL_F_CAP_PER_PACKET flag is set on ip6tnl0, and all other + * parameters are 0). + */ + if (strict && + (!ipv6_addr_any(&p->laddr) || !ipv6_addr_any(&p->raddr) || + p->flags != t->parms.flags || p->hop_limit || p->encap_limit || + p->flowinfo || p->link || p->fwmark || p->collect_md)) + return -EINVAL; + t->parms.proto = p->proto; netdev_state_change(t->dev); + return 0; } static void @@ -1662,7 +1690,7 @@ ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) - ip6_tnl0_update(t, &p1); + ip6_tnl0_update(t, &p1, false); else ip6_tnl_update(t, &p1); } @@ -1717,7 +1745,9 @@ ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); + int t_hlen; + t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; @@ -1726,13 +1756,13 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { - if (new_mtu > IP6_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { - if (new_mtu > IP_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); @@ -1741,7 +1771,7 @@ int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - return t->parms.link; + return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); @@ -1832,7 +1862,8 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_LLTX; + dev->lltx = true; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; @@ -1857,14 +1888,10 @@ ip6_tnl_dev_init_gen(struct net_device *dev) int t_hlen; t->dev = dev; - t->net = dev_net(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) - goto free_stats; + return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) @@ -1875,21 +1902,18 @@ ip6_tnl_dev_init_gen(struct net_device *dev) t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; + dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); -free_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1925,6 +1949,7 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -1987,17 +2012,22 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6_tnl_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; + struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; + struct net *net; int err; + net = params->link_net ? : dev_net(dev); + ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); @@ -2033,8 +2063,28 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; - if (dev == ip6n->fb_tnl_dev) - return -EINVAL; + if (dev == ip6n->fb_tnl_dev) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { + /* iproute2 always sets TUNNEL_ENCAP_FLAG_CSUM6, so + * let's ignore this flag. + */ + ipencap.flags &= ~TUNNEL_ENCAP_FLAG_CSUM6; + if (memchr_inv(&ipencap, 0, sizeof(ipencap))) { + NL_SET_ERR_MSG(extack, + "Only protocol can be changed for fallback tunnel, not encap params"); + return -EINVAL; + } + } + + ip6_tnl_netlink_parms(data, &p); + if (ip6_tnl0_update(t, &p, true) < 0) { + NL_SET_ERR_MSG(extack, + "Only protocol can be changed for fallback tunnel"); + return -EINVAL; + } + + return 0; + } if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); @@ -2136,7 +2186,7 @@ struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); - return tunnel->net; + return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); @@ -2190,7 +2240,7 @@ static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) +static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; @@ -2202,25 +2252,27 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } @@ -2246,7 +2298,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; + ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) @@ -2267,21 +2319,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) -{ - struct net *net; - LIST_HEAD(list); - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); -} - static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch = ip6_tnl_exit_batch_net, + .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index cdc4d4ee2420..cef3e0210744 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> @@ -39,7 +40,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; @@ -51,7 +52,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->peer_udp_port; err = kernel_connect(sock, - (struct sockaddr *)&udp6_addr, + (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) @@ -73,12 +74,14 @@ error: } EXPORT_SYMBOL_GPL(udp_sock_create6); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, struct in6_addr *saddr, - struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck) +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -106,9 +109,78 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ip6h->daddr = *daddr; ip6h->saddr = *saddr; - ip6tunnel_xmit(sk, skb, dev); - return 0; + ip6tunnel_xmit(sk, skb, dev, ip6cb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); +/** + * udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel + * @skb: Packet for which lookup is done + * @dev: Tunnel device + * @net: Network namespace of tunnel device + * @sock: Socket which provides route info + * @oif: Index of the output interface + * @saddr: Memory to store the src ip address + * @key: Tunnel information + * @sport: UDP source port + * @dport: UDP destination port + * @dsfield: The traffic class field + * @dst_cache: The dst cache to use for lookup + * This function performs a route lookup on a UDP tunnel + * + * It returns a valid dst pointer and stores src address to be used in + * tunnel in param saddr on success, else a pointer encoded error code. + */ + +struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, + struct net_device *dev, + struct net *net, + struct socket *sock, + int oif, + struct in6_addr *saddr, + const struct ip_tunnel_key *key, + __be16 sport, __be16 dport, u8 dsfield, + struct dst_cache *dst_cache) +{ + struct dst_entry *dst = NULL; + struct flowi6 fl6; + +#ifdef CONFIG_DST_CACHE + if (dst_cache) { + dst = dst_cache_get_ip6(dst_cache, saddr); + if (dst) + return dst; + } +#endif + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = IPPROTO_UDP; + fl6.flowi6_oif = oif; + fl6.daddr = key->u.ipv6.dst; + fl6.saddr = key->u.ipv6.src; + fl6.fl6_sport = sport; + fl6.fl6_dport = dport; + fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label); + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, + NULL); + if (IS_ERR(dst)) { + netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); + return ERR_PTR(-ENETUNREACH); + } + if (dst_dev(dst) == dev) { /* is this necessary? */ + netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); + dst_release(dst); + return ERR_PTR(-ELOOP); + } +#ifdef CONFIG_DST_CACHE + if (dst_cache) + dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); +#endif + *saddr = fl6.saddr; + return dst; +} +EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup); + +MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 10b222865d46..ad5290be4dd6 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -45,6 +45,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <linux/etherdevice.h> #define IP6_VTI_HASH_SIZE_SHIFT 5 @@ -174,16 +175,10 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) } } -static void vti6_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); -} - static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); int err; dev->rtnl_link_ops = &vti6_link_ops; @@ -502,7 +497,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) (const struct in6_addr *)&x->id.daddr)) goto tx_err_link_failure; - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); @@ -534,7 +529,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) xmit: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); - skb->dev = skb_dst(skb)->dev; + skb->dev = dst_dev(dst); err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) @@ -568,12 +563,12 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; - xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; default: goto tx_err; @@ -671,7 +666,8 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) dev->flags &= ~IFF_POINTOPOINT; if (keep_mtu && dev->mtu) { - dev->mtu = clamp(dev->mtu, dev->min_mtu, dev->max_mtu); + WRITE_ONCE(dev->mtu, + clamp(dev->mtu, dev->min_mtu, dev->max_mtu)); return; } @@ -892,7 +888,6 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_siocdevprivate = vti6_siocdevprivate, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -908,8 +903,8 @@ static void vti6_dev_setup(struct net_device *dev) dev->netdev_ops = &vti6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; - dev->priv_destructor = vti6_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_TUNNEL6; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); @@ -930,11 +925,8 @@ static inline int vti6_dev_init_gen(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; - t->net = dev_net(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; } @@ -965,6 +957,7 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -1004,17 +997,20 @@ static void vti6_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; struct ip6_tnl *nt; + struct net *net; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); vti6_netlink_parms(data, &nt->parms); nt->parms.proto = IPPROTO_IPV6; + nt->net = net; if (vti6_locate(net, &nt->parms, 0)) return -EEXIST; @@ -1116,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, - struct list_head *list) +static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list) { - int h; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t; + int h; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } @@ -1174,24 +1170,9 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_batch_net(struct list_head *net_list) -{ - struct vti6_net *ip6n; - struct net *net; - LIST_HEAD(list); - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) { - ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, &list); - } - unregister_netdevice_many(&list); - rtnl_unlock(); -} - static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch = vti6_exit_batch_net, + .exit_rtnl = vti6_exit_rtnl_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 51cf37abd142..e047a4680ab0 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -125,7 +125,7 @@ static struct mr_table *ip6mr_mr_table_iter(struct net *net, return ret; } -static struct mr_table *ip6mr_get_table(struct net *net, u32 id) +static struct mr_table *__ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -136,6 +136,16 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id) return NULL; } +static struct mr_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + rcu_read_lock(); + mrt = __ip6mr_get_table(net, id); + rcu_read_unlock(); + return mrt; +} + static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { @@ -177,7 +187,7 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, arg->table = fib_rule_get_table(rule, arg); - mrt = ip6mr_get_table(rule->fr_net, arg->table); + mrt = __ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -242,7 +252,7 @@ static int __net_init ip6mr_rules_init(struct net *net) goto err1; } - err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); + err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT); if (err < 0) goto err2; @@ -276,7 +286,7 @@ static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } -static unsigned int ip6mr_rules_seq_read(struct net *net) +static unsigned int ip6mr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } @@ -304,6 +314,8 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id) return net->ipv6.mrt6; } +#define __ip6mr_get_table ip6mr_get_table + static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { @@ -335,7 +347,7 @@ static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static unsigned int ip6mr_rules_seq_read(struct net *net) +static unsigned int ip6mr_rules_seq_read(const struct net *net) { return 0; } @@ -382,7 +394,7 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; - mrt = ip6mr_get_table(net, id); + mrt = __ip6mr_get_table(net, id); if (mrt) return mrt; @@ -392,6 +404,10 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr_table *mrt) { + struct net *net = read_pnet(&mrt->net); + + WARN_ON_ONCE(!mr_can_free_table(net)); + timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); @@ -411,13 +427,15 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct mr_table *mrt; - mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (!mrt) + rcu_read_lock(); + mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); + if (!mrt) { + rcu_read_unlock(); return ERR_PTR(-ENOENT); + } iter->mrt = mrt; - rcu_read_lock(); return mr_vif_seq_start(seq, pos); } @@ -492,9 +510,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->_c.mfc_un.res.pkt, - mfc->_c.mfc_un.res.bytes, - mfc->_c.mfc_un.res.wrong_if); + atomic_long_read(&mfc->_c.mfc_un.res.pkt), + atomic_long_read(&mfc->_c.mfc_un.res.bytes), + atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && @@ -640,7 +658,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_immutable = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) @@ -821,7 +839,7 @@ static void ipmr_do_expire_process(struct mr_table *mrt) static void ipmr_expire_process(struct timer_list *t) { - struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); + struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); @@ -856,7 +874,7 @@ static void ip6mr_update_thresholds(struct mr_table *mrt, cache->mfc_un.res.maxvif = vifi + 1; } } - cache->mfc_un.res.lastuse = jiffies; + WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int mif6_add(struct net *net, struct mr_table *mrt, @@ -1073,7 +1091,7 @@ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ - skb_push(skb, -skb_network_offset(pkt)); + __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); @@ -1260,11 +1278,9 @@ static int ip6mr_device_event(struct notifier_block *this, return NOTIFY_DONE; } -static unsigned int ip6mr_seq_read(struct net *net) +static unsigned int ip6mr_seq_read(const struct net *net) { - ASSERT_RTNL(); - - return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); + return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, @@ -1369,14 +1385,17 @@ static struct pernet_operations ip6mr_net_ops = { .exit_batch = ip6mr_net_exit_batch, }; +static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, + .msgtype = RTM_GETROUTE, + .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, +}; + int __init ip6_mr_init(void) { int err; - mrt_cachep = kmem_cache_create("ip6_mrt_cache", - sizeof(struct mfc6_cache), - 0, SLAB_HWCACHE_ALIGN, - NULL); + mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN); if (!mrt_cachep) return -ENOMEM; @@ -1394,9 +1413,8 @@ int __init ip6_mr_init(void) goto add_proto_fail; } #endif - err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, - ip6mr_rtm_getroute, ip6mr_rtm_dumproute, 0); - if (err == 0) + err = rtnl_register_many(ip6mr_rtnl_msg_handlers); + if (!err) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 @@ -1411,9 +1429,9 @@ reg_pernet_fail: return err; } -void ip6_mr_cleanup(void) +void __init ip6_mr_cleanup(void) { - rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); + rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif @@ -1498,7 +1516,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, } } if (list_empty(&mrt->mfc_unres_queue)) - del_timer(&mrt->ipmr_expire_timer); + timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { @@ -1879,11 +1897,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, /* * The IP multicast ioctl support routines. */ - -int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) +int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { - struct sioc_sg_req6 sr; - struct sioc_mif_req6 vr; + struct sioc_sg_req6 *sr; + struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); @@ -1895,40 +1912,33 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) switch (cmd) { case SIOCGETMIFCNT_IN6: - if (copy_from_user(&vr, arg, sizeof(vr))) - return -EFAULT; - if (vr.mifi >= mrt->maxvif) + vr = (struct sioc_mif_req6 *)arg; + if (vr->mifi >= mrt->maxvif) return -EINVAL; - vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); + vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); - vif = &mrt->vif_table[vr.mifi]; - if (VIF_EXISTS(mrt, vr.mifi)) { - vr.icount = READ_ONCE(vif->pkt_in); - vr.ocount = READ_ONCE(vif->pkt_out); - vr.ibytes = READ_ONCE(vif->bytes_in); - vr.obytes = READ_ONCE(vif->bytes_out); + vif = &mrt->vif_table[vr->mifi]; + if (VIF_EXISTS(mrt, vr->mifi)) { + vr->icount = READ_ONCE(vif->pkt_in); + vr->ocount = READ_ONCE(vif->pkt_out); + vr->ibytes = READ_ONCE(vif->bytes_in); + vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); - - if (copy_to_user(arg, &vr, sizeof(vr))) - return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: - if (copy_from_user(&sr, arg, sizeof(sr))) - return -EFAULT; + sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); - c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, + &sr->grp.sin6_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); - - if (copy_to_user(arg, &sr, sizeof(sr))) - return -EFAULT; return 0; } rcu_read_unlock(); @@ -1997,9 +2007,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -2018,8 +2028,6 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP6_ADD_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTOCTETS, skb->len); return dst_output(net, sk, skb); } @@ -2027,8 +2035,8 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct * Processing handlers for ip6mr_forward */ -static int ip6mr_forward2(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, int vifi) +static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; @@ -2038,7 +2046,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, vif_dev = vif_dev_read(vif); if (!vif_dev) - goto out_free; + return -1; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { @@ -2047,7 +2055,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); - goto out_free; + return -1; } #endif @@ -2061,7 +2069,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); - goto out_free; + return -1; } skb_dst_drop(skb); @@ -2085,20 +2093,43 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) - goto out_free; + return -1; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; + return 0; +} + +static void ip6mr_forward2(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + struct net_device *indev = skb->dev; + + if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; IP6CB(skb)->flags |= IP6SKB_FORWARDED; - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, skb->dev, vif_dev, - ip6mr_forward2_finish); + NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, + net, NULL, skb, indev, skb->dev, + ip6mr_forward2_finish); + return; + +out_free: + kfree_skb(skb); +} + +static void ip6mr_output2(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + ip6_output(net, NULL, skb); + return; out_free: kfree_skb(skb); - return 0; } /* Called with rcu_read_lock() */ @@ -2124,9 +2155,9 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; - c->_c.mfc_un.res.pkt++; - c->_c.mfc_un.res.bytes += skb->len; - c->_c.mfc_un.res.lastuse = jiffies; + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; @@ -2144,7 +2175,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { - c->_c.mfc_un.res.wrong_if++; + atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2213,6 +2244,56 @@ dont_forward: kfree_skb(skb); } +/* Called under rcu_read_lock() */ +static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt, + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) +{ + int psend = -1; + int ct; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); + + /* Forward the frame */ + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp)) { + if (ipv6_hdr(skb)->hop_limit > + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = c->_c.mfc_parent; + goto last_forward; + } + goto dont_forward; + } + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { + if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ip6mr_output2(net, mrt, skb2, psend); + } + psend = ct; + } + } +last_forward: + if (psend != -1) { + ip6mr_output2(net, mrt, skb, psend); + return; + } + +dont_forward: + kfree_skb(skb); +} /* * Multicast packets for forwarding arrive here @@ -2220,21 +2301,20 @@ dont_forward: int ip6_mr_input(struct sk_buff *skb) { + struct net_device *dev = skb->dev; + struct net *net = dev_net_rcu(dev); struct mfc6_cache *cache; - struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { - .flowi6_iif = skb->dev->ifindex, + .flowi6_iif = dev->ifindex, .flowi6_mark = skb->mark, }; int err; - struct net_device *dev; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ - dev = skb->dev; - if (netif_is_l3_master(skb->dev)) { + if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); @@ -2280,19 +2360,76 @@ int ip6_mr_input(struct sk_buff *skb) return 0; } +int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct flowi6 fl6 = (struct flowi6) { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_mark = skb->mark, + }; + struct mfc6_cache *cache; + struct mr_table *mrt; + int err; + int vif; + + guard(rcu)(); + + if (IP6CB(skb)->flags & IP6SKB_FORWARDED) + goto ip6_output; + if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE)) + goto ip6_output; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + cache = ip6mr_cache_find(mrt, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + if (!cache) { + vif = ip6mr_find_vif(mrt, dev); + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, + &ipv6_hdr(skb)->daddr, + vif); + } + + /* No usable cache entry */ + if (!cache) { + vif = ip6mr_find_vif(mrt, dev); + if (vif >= 0) + return ip6mr_cache_unresolved(mrt, vif, skb, dev); + goto ip6_output; + } + + /* Wrong interface */ + vif = cache->_c.mfc_parent; + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) + goto ip6_output; + + ip6_mr_output_finish(net, mrt, dev, skb, cache); + return 0; + +ip6_output: + return ip6_output(net, sk, skb); +} + int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); - mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (!mrt) + rcu_read_lock(); + mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); + if (!mrt) { + rcu_read_unlock(); return -ENOENT; + } - rcu_read_lock(); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); @@ -2444,8 +2581,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, errout: kfree_skb(skb); - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) @@ -2571,9 +2707,9 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); - tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; + tableid = nla_get_u32_default(tb[RTA_TABLE], 0); - mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); + mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; @@ -2605,7 +2741,9 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; - struct fib_dump_filter filter = {}; + struct fib_dump_filter filter = { + .rtnl_held = true, + }; int err; if (cb->strict_check) { @@ -2618,7 +2756,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (filter.table_id) { struct mr_table *mrt; - mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); + mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 72d4858dec18..8607569de34f 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -71,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -79,6 +80,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 9ce51680290b..a61e742794f9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -49,6 +49,7 @@ #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> +#include <net/psp.h> #include <linux/uaccess.h> @@ -102,42 +103,24 @@ int ip6_ra_control(struct sock *sk, int sel) struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { - if (inet_sk(sk)->is_icsk) { + if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + + icsk->icsk_ext_hdr_len = + psp_sk_overhead(sk) + + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, - opt); + opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt))); sk_dst_reset(sk); return opt; } -static bool setsockopt_needs_rtnl(int optname) -{ - switch (optname) { - case IPV6_ADDRFORM: - case IPV6_ADD_MEMBERSHIP: - case IPV6_DROP_MEMBERSHIP: - case IPV6_JOIN_ANYCAST: - case IPV6_LEAVE_ANYCAST: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; -} - static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { @@ -210,7 +193,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max)) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -244,7 +227,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max) - 4) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); @@ -396,9 +379,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); - int val, valbool; int retv = -ENOPROTOOPT; - bool needs_rtnl = setsockopt_needs_rtnl(optname); + int val, valbool; if (sockptr_is_null(optval)) val = 0; @@ -415,8 +397,155 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); - if (needs_rtnl) - rtnl_lock(); + /* Handle options that can be set without locking the socket. */ + switch (optname) { + case IPV6_UNICAST_HOPS: + if (optlen < sizeof(int)) + return -EINVAL; + if (val > 255 || val < -1) + return -EINVAL; + WRITE_ONCE(np->hop_limit, val); + return 0; + case IPV6_MULTICAST_LOOP: + if (optlen < sizeof(int)) + return -EINVAL; + if (val != valbool) + return -EINVAL; + inet6_assign_bit(MC6_LOOP, sk, valbool); + return 0; + case IPV6_MULTICAST_HOPS: + if (sk->sk_type == SOCK_STREAM) + return retv; + if (optlen < sizeof(int)) + return -EINVAL; + if (val > 255 || val < -1) + return -EINVAL; + WRITE_ONCE(np->mcast_hops, + val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); + return 0; + case IPV6_MTU: + if (optlen < sizeof(int)) + return -EINVAL; + if (val && val < IPV6_MIN_MTU) + return -EINVAL; + WRITE_ONCE(np->frag_size, val); + return 0; + case IPV6_MINHOPCOUNT: + if (optlen < sizeof(int)) + return -EINVAL; + if (val < 0 || val > 255) + return -EINVAL; + + if (val) + static_branch_enable(&ip6_min_hopcount); + + /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount + * while we are changing it. + */ + WRITE_ONCE(np->min_hopcount, val); + return 0; + case IPV6_RECVERR_RFC4884: + if (optlen < sizeof(int)) + return -EINVAL; + if (val < 0 || val > 1) + return -EINVAL; + inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); + return 0; + case IPV6_MULTICAST_ALL: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(MC6_ALL, sk, valbool); + return 0; + case IPV6_AUTOFLOWLABEL: + inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); + inet6_set_bit(AUTOFLOWLABEL_SET, sk); + return 0; + case IPV6_DONTFRAG: + inet6_assign_bit(DONTFRAG, sk, valbool); + return 0; + case IPV6_RECVERR: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(RECVERR6, sk, valbool); + if (!val) + skb_errqueue_purge(&sk->sk_error_queue); + return 0; + case IPV6_ROUTER_ALERT_ISOLATE: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); + return 0; + case IPV6_MTU_DISCOVER: + if (optlen < sizeof(int)) + return -EINVAL; + if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) + return -EINVAL; + WRITE_ONCE(np->pmtudisc, val); + return 0; + case IPV6_FLOWINFO_SEND: + if (optlen < sizeof(int)) + return -EINVAL; + inet6_assign_bit(SNDFLOW, sk, valbool); + return 0; + case IPV6_ADDR_PREFERENCES: + if (optlen < sizeof(int)) + return -EINVAL; + return ip6_sock_set_addr_preferences(sk, val); + case IPV6_MULTICAST_IF: + if (sk->sk_type == SOCK_STREAM) + return -ENOPROTOOPT; + if (optlen < sizeof(int)) + return -EINVAL; + if (val) { + struct net_device *dev; + int bound_dev_if, midx; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, val); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + midx = l3mdev_master_ifindex_rcu(dev); + + rcu_read_unlock(); + + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if && + bound_dev_if != val && + (!midx || midx != bound_dev_if)) + return -EINVAL; + } + WRITE_ONCE(np->mcast_oif, val); + return 0; + case IPV6_UNICAST_IF: + { + struct net_device *dev; + int ifindex; + + if (optlen != sizeof(int)) + return -EINVAL; + + ifindex = (__force int)ntohl((__force __be32)val); + if (!ifindex) { + WRITE_ONCE(np->ucast_oif, 0); + return 0; + } + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return -EADDRNOTAVAIL; + dev_put(dev); + + if (READ_ONCE(sk->sk_bound_dev_if)) + return -EINVAL; + + WRITE_ONCE(np->ucast_oif, ifindex); + return 0; + } + } + sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with @@ -464,13 +593,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); - /* - * Sock is moving from IPv6 to IPv4 (sk_prot), so - * remove it from the refcnt debug socks count in the - * original family... - */ - sk_refcnt_debug_dec(sk); - if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -481,8 +603,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); - sk->sk_socket->ops = &inet_stream_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); + WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; @@ -495,8 +617,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); - sk->sk_socket->ops = &inet_dgram_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); + WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, @@ -507,11 +629,6 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, inet6_cleanup_sock(sk); - /* - * ... and add it to the refcnt debug socks count - * in the new family. -acme - */ - sk_refcnt_debug_inc(sk); module_put(THIS_MODULE); retv = 0; break; @@ -645,7 +762,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ - inet_sk(sk)->transparent = valbool; + inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; @@ -653,7 +770,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ - inet_sk(sk)->freebind = valbool; + inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; @@ -728,6 +845,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, goto done; msg.msg_controllen = optlen; + msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; @@ -744,95 +862,7 @@ done: } break; } - case IPV6_UNICAST_HOPS: - if (optlen < sizeof(int)) - goto e_inval; - if (val > 255 || val < -1) - goto e_inval; - np->hop_limit = val; - retv = 0; - break; - - case IPV6_MULTICAST_HOPS: - if (sk->sk_type == SOCK_STREAM) - break; - if (optlen < sizeof(int)) - goto e_inval; - if (val > 255 || val < -1) - goto e_inval; - np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); - retv = 0; - break; - - case IPV6_MULTICAST_LOOP: - if (optlen < sizeof(int)) - goto e_inval; - if (val != valbool) - goto e_inval; - np->mc_loop = valbool; - retv = 0; - break; - - case IPV6_UNICAST_IF: - { - struct net_device *dev = NULL; - int ifindex; - - if (optlen != sizeof(int)) - goto e_inval; - - ifindex = (__force int)ntohl((__force __be32)val); - if (ifindex == 0) { - np->ucast_oif = 0; - retv = 0; - break; - } - - dev = dev_get_by_index(net, ifindex); - retv = -EADDRNOTAVAIL; - if (!dev) - break; - dev_put(dev); - retv = -EINVAL; - if (sk->sk_bound_dev_if) - break; - - np->ucast_oif = ifindex; - retv = 0; - break; - } - - case IPV6_MULTICAST_IF: - if (sk->sk_type == SOCK_STREAM) - break; - if (optlen < sizeof(int)) - goto e_inval; - - if (val) { - struct net_device *dev; - int midx; - - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, val); - if (!dev) { - rcu_read_unlock(); - retv = -ENODEV; - break; - } - midx = l3mdev_master_ifindex_rcu(dev); - - rcu_read_unlock(); - - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != val && - (!midx || midx != sk->sk_bound_dev_if)) - goto e_inval; - } - np->mcast_oif = val; - retv = 0; - break; case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { @@ -842,7 +872,7 @@ done: goto e_inval; retv = -EPROTO; - if (inet_sk(sk)->is_icsk) + if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; @@ -873,13 +903,6 @@ done: retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } - case IPV6_MULTICAST_ALL: - if (optlen < sizeof(int)) - goto e_inval; - np->mc_all = valbool; - retv = 0; - break; - case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) @@ -906,42 +929,8 @@ done: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); - break; - case IPV6_ROUTER_ALERT_ISOLATE: - if (optlen < sizeof(int)) - goto e_inval; - np->rtalert_isolate = valbool; - retv = 0; - break; - case IPV6_MTU_DISCOVER: - if (optlen < sizeof(int)) - goto e_inval; - if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) - goto e_inval; - np->pmtudisc = val; - retv = 0; - break; - case IPV6_MTU: - if (optlen < sizeof(int)) - goto e_inval; - if (val && val < IPV6_MIN_MTU) - goto e_inval; - np->frag_size = val; - retv = 0; - break; - case IPV6_RECVERR: - if (optlen < sizeof(int)) - goto e_inval; - np->recverr = valbool; - if (!val) - skb_queue_purge(&sk->sk_error_queue); - retv = 0; - break; - case IPV6_FLOWINFO_SEND: - if (optlen < sizeof(int)) - goto e_inval; - np->sndflow = valbool; - retv = 0; + if (retv == 0) + inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); @@ -954,53 +943,14 @@ done: retv = xfrm_user_policy(sk, optname, optval, optlen); break; - case IPV6_ADDR_PREFERENCES: - if (optlen < sizeof(int)) - goto e_inval; - retv = __ip6_sock_set_addr_preferences(sk, val); - break; - case IPV6_MINHOPCOUNT: - if (optlen < sizeof(int)) - goto e_inval; - if (val < 0 || val > 255) - goto e_inval; - - if (val) - static_branch_enable(&ip6_min_hopcount); - - /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount - * while we are changing it. - */ - WRITE_ONCE(np->min_hopcount, val); - retv = 0; - break; - case IPV6_DONTFRAG: - np->dontfrag = valbool; - retv = 0; - break; - case IPV6_AUTOFLOWLABEL: - np->autoflowlabel = valbool; - np->autoflowlabel_set = 1; - retv = 0; - break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; - case IPV6_RECVERR_RFC4884: - if (optlen < sizeof(int)) - goto e_inval; - if (val < 0 || val > 1) - goto e_inval; - np->recverr_rfc4884 = valbool; - retv = 0; - break; } unlock: sockopt_release_sock(sk); - if (needs_rtnl) - rtnl_unlock(); return retv; @@ -1015,7 +965,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) - return udp_prot.setsockopt(sk, level, optname, optval, optlen); + return ip_setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; @@ -1184,14 +1134,17 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { + int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; - src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + + src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; - src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; + src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { - int hlim = np->mcast_hops; + int hlim = READ_ONCE(np->mcast_hops); + put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { @@ -1200,15 +1153,18 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { + int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; - src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + + src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; - src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : - np->sticky_pktinfo.ipi6_addr; + src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : + np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { - int hlim = np->mcast_hops; + int hlim = READ_ONCE(np->mcast_hops); + put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { @@ -1341,11 +1297,11 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_TRANSPARENT: - val = inet_sk(sk)->transparent; + val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: - val = inet_sk(sk)->freebind; + val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: @@ -1358,9 +1314,9 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) - val = np->hop_limit; + val = READ_ONCE(np->hop_limit); else - val = np->mcast_hops; + val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); @@ -1371,36 +1327,36 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } if (val < 0) - val = sock_net(sk)->ipv6.devconf_all->hop_limit; + val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit); break; } case IPV6_MULTICAST_LOOP: - val = np->mc_loop; + val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: - val = np->mcast_oif; + val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: - val = np->mc_all; + val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: - val = (__force int)htonl((__u32) np->ucast_oif); + val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: - val = np->pmtudisc; + val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: - val = np->recverr; + val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: - val = np->sndflow; + val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: @@ -1435,45 +1391,51 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_ADDR_PREFERENCES: + { + u8 srcprefs = READ_ONCE(np->srcprefs); val = 0; - if (np->srcprefs & IPV6_PREFER_SRC_TMP) + if (srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; - else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC) + else if (srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } - if (np->srcprefs & IPV6_PREFER_SRC_COA) + if (srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; - + } case IPV6_MINHOPCOUNT: - val = np->min_hopcount; + val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: - val = np->dontfrag; + val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: - val = ip6_autoflowlabel(sock_net(sk), np); + val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; + case IPV6_ROUTER_ALERT: + val = inet6_test_bit(RTALERT, sk); + break; + case IPV6_ROUTER_ALERT_ISOLATE: - val = np->rtalert_isolate; + val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: - val = np->recverr_rfc4884; + val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: @@ -1493,7 +1455,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) - return udp_prot.getsockopt(sk, level, optname, optval, optlen); + return ip_getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 1c02160cf7a4..016b572e7d6f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -33,8 +33,10 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> +#include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> +#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -47,6 +49,7 @@ #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> @@ -105,9 +108,9 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; -/* - * socket join on multicast group - */ +#define mc_assert_locked(idev) \ + lockdep_assert_held(&(idev)->mc_lock) + #define mc_dereference(e, idev) \ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) @@ -159,24 +162,48 @@ static int unsolicited_report_interval(struct inet6_dev *idev) int iv; if (mld_in_v1_mode(idev)) - iv = idev->cnf.mldv1_unsolicited_report_interval; + iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval); else - iv = idev->cnf.mldv2_unsolicited_report_interval; + iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval); return iv > 0 ? iv : 1; } +static struct net_device *ip6_mc_find_dev(struct net *net, + const struct in6_addr *group, + int ifindex) +{ + struct net_device *dev = NULL; + struct rt6_info *rt; + + if (ifindex == 0) { + rcu_read_lock(); + rt = rt6_lookup(net, group, NULL, 0, NULL, 0); + if (rt) { + dev = dst_dev_rcu(&rt->dst); + dev_hold(dev); + ip6_rt_put(rt); + } + rcu_read_unlock(); + } else { + dev = dev_get_by_index(net, ifindex); + } + + return dev; +} + +/* + * socket join on multicast group + */ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { - struct net_device *dev = NULL; - struct ipv6_mc_socklist *mc_lst; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); + struct net_device *dev = NULL; int err; - ASSERT_RTNL(); - if (!ipv6_addr_is_multicast(addr)) return -EINVAL; @@ -187,23 +214,13 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); - if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; - if (ifindex == 0) { - struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); - if (rt) { - dev = rt->dst.dev; - ip6_rt_put(rt); - } - } else - dev = __dev_get_by_index(net, ifindex); - + dev = ip6_mc_find_dev(net, addr, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; @@ -213,12 +230,11 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, mc_lst->sfmode = mode; RCU_INIT_POINTER(mc_lst->sflist, NULL); - /* - * now add/increase the group membership on the device - */ - + /* now add/increase the group membership on the device */ err = __ipv6_dev_mc_inc(dev, addr, mode); + dev_put(dev); + if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; @@ -245,14 +261,36 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, /* * socket leave on multicast group */ +static void __ipv6_sock_mc_drop(struct sock *sk, struct ipv6_mc_socklist *mc_lst) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + + dev = dev_get_by_index(net, mc_lst->ifindex); + if (dev) { + struct inet6_dev *idev = in6_dev_get(dev); + + ip6_mc_leave_src(sk, mc_lst, idev); + + if (idev) { + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + in6_dev_put(idev); + } + + dev_put(dev); + } else { + ip6_mc_leave_src(sk, mc_lst, NULL); + } + + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); +} + int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_mc_socklist *mc_lst; struct ipv6_mc_socklist __rcu **lnk; - struct net *net = sock_net(sk); - - ASSERT_RTNL(); + struct ipv6_mc_socklist *mc_lst; if (!ipv6_addr_is_multicast(addr)) return -EINVAL; @@ -262,23 +300,8 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { - struct net_device *dev; - *lnk = mc_lst->next; - - dev = __dev_get_by_index(net, mc_lst->ifindex); - if (dev) { - struct inet6_dev *idev = __in6_dev_get(dev); - - ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) - __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else { - ip6_mc_leave_src(sk, mc_lst, NULL); - } - - atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); - kfree_rcu(mc_lst, rcu); + __ipv6_sock_mc_drop(sk, mc_lst); return 0; } } @@ -287,31 +310,20 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } EXPORT_SYMBOL(ipv6_sock_mc_drop); -static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net, - const struct in6_addr *group, - int ifindex) +static struct inet6_dev *ip6_mc_find_idev(struct net *net, + const struct in6_addr *group, + int ifindex) { - struct net_device *dev = NULL; - struct inet6_dev *idev = NULL; - - if (ifindex == 0) { - struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); - - if (rt) { - dev = rt->dst.dev; - ip6_rt_put(rt); - } - } else { - dev = __dev_get_by_index(net, ifindex); - } + struct net_device *dev; + struct inet6_dev *idev; + dev = ip6_mc_find_dev(net, group, ifindex); if (!dev) return NULL; - idev = __in6_dev_get(dev); - if (!idev) - return NULL; - if (idev->dead) - return NULL; + + idev = in6_dev_get(dev); + dev_put(dev); + return idev; } @@ -319,28 +331,10 @@ void __ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; - struct net *net = sock_net(sk); - - ASSERT_RTNL(); while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) { - struct net_device *dev; - np->ipv6_mc_list = mc_lst->next; - - dev = __dev_get_by_index(net, mc_lst->ifindex); - if (dev) { - struct inet6_dev *idev = __in6_dev_get(dev); - - ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) - __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else { - ip6_mc_leave_src(sk, mc_lst, NULL); - } - - atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); - kfree_rcu(mc_lst, rcu); + __ipv6_sock_mc_drop(sk, mc_lst); } } @@ -351,24 +345,22 @@ void ipv6_sock_mc_close(struct sock *sk) if (!rcu_access_pointer(np->ipv6_mc_list)) return; - rtnl_lock(); lock_sock(sk); __ipv6_sock_mc_close(sk); release_sock(sk); - rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, - struct group_source_req *pgsr) + struct group_source_req *pgsr) { + struct ipv6_pinfo *inet6 = inet6_sk(sk); struct in6_addr *source, *group; + struct net *net = sock_net(sk); struct ipv6_mc_socklist *pmc; - struct inet6_dev *idev; - struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; - struct net *net = sock_net(sk); - int i, j, rv; + struct inet6_dev *idev; int leavegroup = 0; + int i, j, rv; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; @@ -377,13 +369,19 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface); + idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; + mutex_lock(&idev->mc_lock); + + if (idev->dead) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; - mutex_lock(&idev->mc_lock); for_each_pmc_socklock(inet6, sk, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; @@ -480,6 +478,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, ip6_mc_add_src(idev, group, omode, 1, source, 1); done: mutex_unlock(&idev->mc_lock); + in6_dev_put(idev); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; @@ -488,12 +487,12 @@ done: int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list) { - const struct in6_addr *group; - struct ipv6_mc_socklist *pmc; - struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); + const struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; int leavegroup = 0; int i, err; @@ -505,10 +504,17 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); + idev = ip6_mc_find_idev(net, group, gsf->gf_interface); if (!idev) return -ENODEV; + mutex_lock(&idev->mc_lock); + + if (idev->dead) { + err = -ENODEV; + goto done; + } + err = 0; if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { @@ -541,24 +547,19 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } - mutex_lock(&idev->mc_lock); + err = ip6_mc_add_src(idev, group, gsf->gf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { - mutex_unlock(&idev->mc_lock); sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } - mutex_unlock(&idev->mc_lock); } else { newpsl = NULL; - mutex_lock(&idev->mc_lock); ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); - mutex_unlock(&idev->mc_lock); } - mutex_lock(&idev->mc_lock); psl = sock_dereference(pmc->sflist, sk); if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, @@ -568,12 +569,14 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); } + rcu_assign_pointer(pmc->sflist, newpsl); - mutex_unlock(&idev->mc_lock); kfree_rcu(psl, rcu); pmc->sfmode = gsf->gf_fmode; err = 0; done: + mutex_unlock(&idev->mc_lock); + in6_dev_put(idev); if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; @@ -586,17 +589,14 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; - int i, count, copycount; + unsigned int count; + int i, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; - /* changes to the ipv6_mc_list require the socket lock and - * rtnl lock. We have the socket lock, so reading the list is safe. - */ - for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; @@ -610,7 +610,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, psl = sock_dereference(pmc->sflist, sk); count = psl ? psl->sl_count : 0; - copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + copycount = min(count, gsf->gf_numsrc); gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; @@ -627,12 +627,12 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, return 0; } -bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, +bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_mc_socklist *mc; - struct ip6_sf_socklist *psl; + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_mc_socklist *mc; + const struct ip6_sf_socklist *psl; bool rv = true; rcu_read_lock(); @@ -642,7 +642,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, } if (!mc) { rcu_read_unlock(); - return np->mc_all; + return inet6_test_bit(MC6_ALL, sk); } psl = rcu_dereference(mc->sflist); if (!psl) { @@ -664,12 +664,13 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, return rv; } -/* called with mc_lock */ static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + mc_assert_locked(mc->idev); + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; @@ -699,12 +700,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) mld_ifc_event(mc->idev); } -/* called with mc_lock */ static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + mc_assert_locked(mc->idev); + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; @@ -725,14 +727,13 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) refcount_dec(&mc->mca_refcnt); } -/* - * deleted ifmcaddr6 manipulation - * called with mc_lock - */ +/* deleted ifmcaddr6 manipulation */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc; + mc_assert_locked(idev); + /* this is an "ifmcaddr6" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure @@ -766,54 +767,54 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) rcu_assign_pointer(idev->mc_tomb, pmc); } -/* called with mc_lock */ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; struct ifmcaddr6 *pmc, *pmc_prev; + mc_assert_locked(idev); + pmc_prev = NULL; for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } - if (pmc) { - if (pmc_prev) - rcu_assign_pointer(pmc_prev->next, pmc->next); - else - rcu_assign_pointer(idev->mc_tomb, pmc->next); - } - - if (pmc) { - im->idev = pmc->idev; - if (im->mca_sfmode == MCAST_INCLUDE) { - tomb = rcu_replace_pointer(im->mca_tomb, - mc_dereference(pmc->mca_tomb, pmc->idev), - lockdep_is_held(&im->idev->mc_lock)); - rcu_assign_pointer(pmc->mca_tomb, tomb); - - sources = rcu_replace_pointer(im->mca_sources, - mc_dereference(pmc->mca_sources, pmc->idev), - lockdep_is_held(&im->idev->mc_lock)); - rcu_assign_pointer(pmc->mca_sources, sources); - for_each_psf_mclock(im, psf) - psf->sf_crcount = idev->mc_qrv; - } else { - im->mca_crcount = idev->mc_qrv; - } - in6_dev_put(pmc->idev); - ip6_mc_clear_src(pmc); - kfree_rcu(pmc, rcu); + if (!pmc) + return; + if (pmc_prev) + rcu_assign_pointer(pmc_prev->next, pmc->next); + else + rcu_assign_pointer(idev->mc_tomb, pmc->next); + + im->idev = pmc->idev; + if (im->mca_sfmode == MCAST_INCLUDE) { + tomb = rcu_replace_pointer(im->mca_tomb, + mc_dereference(pmc->mca_tomb, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); + rcu_assign_pointer(pmc->mca_tomb, tomb); + + sources = rcu_replace_pointer(im->mca_sources, + mc_dereference(pmc->mca_sources, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); + rcu_assign_pointer(pmc->mca_sources, sources); + for_each_psf_mclock(im, psf) + psf->sf_crcount = idev->mc_qrv; + } else { + im->mca_crcount = idev->mc_qrv; } + ip6_mc_clear_src(pmc); + in6_dev_put(pmc->idev); + kfree_rcu(pmc, rcu); } -/* called with mc_lock */ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; + mc_assert_locked(idev); + pmc = mc_dereference(idev->mc_tomb, idev); RCU_INIT_POINTER(idev->mc_tomb, NULL); @@ -839,29 +840,18 @@ static void mld_clear_delrec(struct inet6_dev *idev) static void mld_clear_query(struct inet6_dev *idev) { - struct sk_buff *skb; - spin_lock_bh(&idev->mc_query_lock); - while ((skb = __skb_dequeue(&idev->mc_query_queue))) - kfree_skb(skb); + __skb_queue_purge(&idev->mc_query_queue); spin_unlock_bh(&idev->mc_query_lock); } static void mld_clear_report(struct inet6_dev *idev) { - struct sk_buff *skb; - spin_lock_bh(&idev->mc_report_lock); - while ((skb = __skb_dequeue(&idev->mc_report_queue))) - kfree_skb(skb); + __skb_queue_purge(&idev->mc_report_queue); spin_unlock_bh(&idev->mc_report_lock); } -static void mca_get(struct ifmcaddr6 *mc) -{ - refcount_inc(&mc->mca_refcnt); -} - static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { @@ -870,13 +860,14 @@ static void ma_put(struct ifmcaddr6 *mc) } } -/* called with mc_lock */ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; + mc_assert_locked(idev); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; @@ -900,29 +891,63 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, return mc; } +static void inet6_ifmcaddr_notify(struct net_device *dev, + const struct ifmcaddr6 *ifmca, int event) +{ + struct inet6_fill_args fillargs = { + .portid = 0, + .seq = 0, + .event = event, + .flags = 0, + .netnsid = -1, + .force_rt_scope_universe = true, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); +} + /* * device multicast group inc (add if not found) */ static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode) { - struct ifmcaddr6 *mc; struct inet6_dev *idev; - - ASSERT_RTNL(); + struct ifmcaddr6 *mc; /* we need to take a reference on idev */ idev = in6_dev_get(dev); - if (!idev) return -EINVAL; - if (idev->dead) { + mutex_lock(&idev->mc_lock); + + if (READ_ONCE(idev->dead)) { + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENODEV; } - mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; @@ -943,12 +968,11 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, rcu_assign_pointer(mc->next, idev->mc_list); rcu_assign_pointer(idev->mc_list, mc); - mca_get(mc); - mld_del_delrec(idev, mc); igmp6_group_added(mc); + inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); - ma_put(mc); + return 0; } @@ -965,9 +989,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, __rcu **map; - ASSERT_RTNL(); - mutex_lock(&idev->mc_lock); + for (map = &idev->mc_list; (ma = mc_dereference(*map, idev)); map = &ma->next) { @@ -976,6 +999,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) *map = ma->next; igmp6_group_dropped(ma); + inet6_ifmcaddr_notify(idev->dev, ma, + RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); @@ -996,13 +1021,12 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) struct inet6_dev *idev; int err; - ASSERT_RTNL(); - - idev = __in6_dev_get(dev); + idev = in6_dev_get(dev); if (!idev) - err = -ENODEV; - else - err = __ipv6_dev_mc_dec(idev, addr); + return -ENODEV; + + err = __ipv6_dev_mc_dec(idev, addr); + in6_dev_put(idev); return err; } @@ -1020,73 +1044,80 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rcu_read_lock(); idev = __in6_dev_get(dev); - if (idev) { - for_each_mc_rcu(idev, mc) { - if (ipv6_addr_equal(&mc->mca_addr, group)) - break; - } - if (mc) { - if (src_addr && !ipv6_addr_any(src_addr)) { - struct ip6_sf_list *psf; + if (!idev) + goto unlock; + for_each_mc_rcu(idev, mc) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (!mc) + goto unlock; + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; - for_each_psf_rcu(mc, psf) { - if (ipv6_addr_equal(&psf->sf_addr, src_addr)) - break; - } - if (psf) - rv = psf->sf_count[MCAST_INCLUDE] || - psf->sf_count[MCAST_EXCLUDE] != - mc->mca_sfcount[MCAST_EXCLUDE]; - else - rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; - } else - rv = true; /* don't filter unspecified source */ + for_each_psf_rcu(mc, psf) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; } + if (psf) + rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || + READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != + READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); + else + rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; + } else { + rv = true; /* don't filter unspecified source */ } +unlock: rcu_read_unlock(); return rv; } -/* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = get_random_u32_below(idev->mc_maxdelay); + mc_assert_locked(idev); + idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } -/* called with mc_lock */ static void mld_gq_stop_work(struct inet6_dev *idev) { + mc_assert_locked(idev); + idev->mc_gq_running = 0; if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } -/* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); + mc_assert_locked(idev); + if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } -/* called with mc_lock */ static void mld_ifc_stop_work(struct inet6_dev *idev) { + mc_assert_locked(idev); + idev->mc_ifc_count = 0; if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } -/* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); + mc_assert_locked(idev); + if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } @@ -1111,14 +1142,13 @@ static void mld_report_stop_work(struct inet6_dev *idev) __in6_dev_put(idev); } -/* - * IGMP handling (alias multicast ICMPv6 messages) - * called with mc_lock - */ +/* IGMP handling (alias multicast ICMPv6 messages) */ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; + mc_assert_locked(ma->idev); + /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) @@ -1137,15 +1167,15 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) ma->mca_flags |= MAF_TIMER_RUNNING; } -/* mark EXCLUDE-mode sources - * called with mc_lock - */ +/* mark EXCLUDE-mode sources */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; + mc_assert_locked(pmc->idev); + scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) @@ -1168,13 +1198,14 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, return true; } -/* called with mc_lock */ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; + mc_assert_locked(pmc->idev); + if (pmc->mca_sfmode == MCAST_EXCLUDE) return mld_xmarksources(pmc, nsrcs, srcs); @@ -1202,15 +1233,15 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, static int mld_force_mld_version(const struct inet6_dev *idev) { + const struct net *net = dev_net(idev->dev); + int all_force; + + all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version); /* Normally, both are 0 here. If enforcement to a particular is * being used, individual device enforcement will have a lower * precedence over 'all' device (.../conf/all/force_mld_version). */ - - if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0) - return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version; - else - return idev->cnf.force_mld_version; + return all_force ?: READ_ONCE(idev->cnf.force_mld_version); } static bool mld_in_v2_mode_only(const struct inet6_dev *idev) @@ -1699,11 +1730,9 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } -static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int proto, int len) +static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, + struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; @@ -1718,7 +1747,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, hdr->payload_len = htons(len); hdr->nexthdr = proto; - hdr->hop_limit = inet6_sk(sk)->hop_limit; + hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit); hdr->saddr = *saddr; hdr->daddr = *daddr; @@ -1731,21 +1760,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; - int err; + struct net *net; - sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; - skb = sock_alloc_send_skb(sk, size, 1, &err); + skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; @@ -1753,6 +1780,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); + rcu_read_lock(); + + net = dev_net_rcu(dev); + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address @@ -1764,6 +1797,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + rcu_read_unlock(); + skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); @@ -1791,7 +1826,7 @@ static void mld_sendpack(struct sk_buff *skb) rcu_read_lock(); idev = __in6_dev_get(skb->dev); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - sizeof(*pip6); @@ -1865,7 +1900,6 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) -/* called with mc_lock */ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) @@ -1879,6 +1913,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_report *pmr; unsigned int mtu; + mc_assert_locked(idev); + if (pmc->mca_flags & MAF_NOREPORT) return skb; @@ -1997,12 +2033,13 @@ empty_source: return skb; } -/* called with mc_lock */ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; + mc_assert_locked(idev); + if (!pmc) { for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) @@ -2024,10 +2061,7 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) mld_sendpack(skb); } -/* - * remove zero-count source records from a source filter list - * called with mc_lock - */ +/* remove zero-count source records from a source filter list */ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; @@ -2051,7 +2085,6 @@ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *i } } -/* called with mc_lock */ static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; @@ -2123,21 +2156,21 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; + const struct in6_addr *snd_addr, *saddr; + int err, len, payload_len, full_len; + struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; - const struct in6_addr *snd_addr, *saddr; - struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; + struct net *net; + struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -2148,20 +2181,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - rcu_read_lock(); - IP6_UPD_PO_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUT, full_len); - rcu_read_unlock(); + skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); - skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + rcu_read_lock(); + net = dev_net_rcu(dev); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); @@ -2186,9 +2220,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); @@ -2217,13 +2248,14 @@ err_out: goto out; } -/* called with mc_lock */ static void mld_send_initial_cr(struct inet6_dev *idev) { - struct sk_buff *skb; struct ifmcaddr6 *pmc; + struct sk_buff *skb; int type; + mc_assert_locked(idev); + if (mld_in_v1_mode(idev)) return; @@ -2270,13 +2302,14 @@ static void mld_dad_work(struct work_struct *work) in6_dev_put(idev); } -/* called with mc_lock */ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, - const struct in6_addr *psfsrc) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; + mc_assert_locked(pmc->idev); + psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) @@ -2287,7 +2320,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* source filter not found, or count wrong => bug */ return -ESRCH; } - psf->sf_count[sfmode]--; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; @@ -2313,7 +2346,6 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, return rv; } -/* called with mc_lock */ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) @@ -2325,6 +2357,8 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; + mc_assert_locked(idev); + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; @@ -2366,15 +2400,14 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, return err; } -/* - * Add multicast single-source filter to the interface list - * called with mc_lock - */ +/* Add multicast single-source filter to the interface list */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, - const struct in6_addr *psfsrc) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; + mc_assert_locked(pmc->idev); + psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) @@ -2393,15 +2426,16 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, rcu_assign_pointer(pmc->mca_sources, psf); } } - psf->sf_count[sfmode]++; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } -/* called with mc_lock */ static void sf_markstate(struct ifmcaddr6 *pmc) { - struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + struct ip6_sf_list *psf; + + mc_assert_locked(pmc->idev); for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { @@ -2414,14 +2448,15 @@ static void sf_markstate(struct ifmcaddr6 *pmc) } } -/* called with mc_lock */ static int sf_setstate(struct ifmcaddr6 *pmc) { - struct ip6_sf_list *psf, *dpsf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + struct ip6_sf_list *psf, *dpsf; int qrv = pmc->idev->mc_qrv; int new_in, rv; + mc_assert_locked(pmc->idev); + rv = 0; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { @@ -2480,10 +2515,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) return rv; } -/* - * Add multicast source filter list to the interface list - * called with mc_lock - */ +/* Add multicast source filter list to the interface list */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) @@ -2495,6 +2527,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; + mc_assert_locked(idev); + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; @@ -2505,7 +2539,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) - pmc->mca_sfcount[sfmode]++; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); @@ -2516,7 +2551,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int j; if (!delta) - pmc->mca_sfcount[sfmode]--; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { @@ -2540,11 +2576,12 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, return err; } -/* called with mc_lock */ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; + mc_assert_locked(pmc->idev); + for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); psf; psf = nextpsf) { @@ -2561,14 +2598,16 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; - pmc->mca_sfcount[MCAST_EXCLUDE] = 1; + /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ + WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } -/* called with mc_lock */ static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; + mc_assert_locked(ma->idev); + if (ma->mca_flags & MAF_NOREPORT) return; @@ -2615,9 +2654,10 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, return err; } -/* called with mc_lock */ static void igmp6_leave_group(struct ifmcaddr6 *ma) { + mc_assert_locked(ma->idev); + if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, @@ -2662,9 +2702,10 @@ static void mld_ifc_work(struct work_struct *work) in6_dev_put(idev); } -/* called with mc_lock */ static void mld_ifc_event(struct inet6_dev *idev) { + mc_assert_locked(idev); + if (mld_in_v1_mode(idev)) return; @@ -2722,11 +2763,14 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Should stop work after group drop. or we will * start work again in mld_ifc_event() */ - synchronize_net(); mld_query_stop_work(idev); mld_report_stop_work(idev); + + mutex_lock(&idev->mc_lock); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); + mutex_unlock(&idev->mc_lock); + mld_dad_stop_work(idev); } @@ -2816,8 +2860,6 @@ static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) { struct ifmcaddr6 *pmc; - ASSERT_RTNL(); - mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { for_each_mc_mclock(idev, pmc) @@ -3013,8 +3055,6 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s continue; state->im = rcu_dereference(state->idev->mc_list); } - if (!state->im) - break; psf = rcu_dereference(state->im->mca_sources); } out: @@ -3075,8 +3115,8 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, - psf->sf_count[MCAST_INCLUDE], - psf->sf_count[MCAST_EXCLUDE]); + READ_ONCE(psf->sf_count[MCAST_INCLUDE]), + READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 83d2a8be263f..6a16a5bd0d91 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -405,6 +405,7 @@ static void __exit mip6_fini(void) module_init(mip6_init); module_exit(mip6_fini); +MODULE_DESCRIPTION("IPv6 Mobility driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3a553494ff16..59d17b6f06bf 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -130,7 +130,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, @@ -197,11 +197,12 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { - return opt->nd_opt_type == ND_OPT_RDNSS || + return opt->nd_opt_type == ND_OPT_PREFIX_INFO || + opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || + opt->nd_opt_type == ND_OPT_6CO || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || - opt->nd_opt_type == ND_OPT_PREF64 || - ndisc_ops_is_useropt(dev, opt->nd_opt_type); + opt->nd_opt_type == ND_OPT_PREF64; } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, @@ -226,6 +227,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { + bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; @@ -241,9 +243,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { - ND_PRINTK(2, warn, - "%s: duplicated ND6 option found: type=%d\n", - __func__, nd_opt->nd_opt_type); + net_dbg_ratelimited("%s: duplicated ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } @@ -261,22 +262,20 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, break; #endif default: - if (ndisc_is_useropt(dev, nd_opt)) { - ndopts->nd_useropts_end = nd_opt; - if (!ndopts->nd_useropts) - ndopts->nd_useropts = nd_opt; - } else { - /* - * Unknown options must be silently ignored, - * to accommodate future extension to the - * protocol. - */ - ND_PRINTK(2, notice, - "%s: ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, - nd_opt->nd_opt_len); - } + unknown = true; + } + if (ndisc_is_useropt(dev, nd_opt)) { + ndopts->nd_useropts_end = nd_opt; + if (!ndopts->nd_useropts) + ndopts->nd_useropts = nd_opt; + } else if (unknown) { + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the + * protocol. + */ + net_dbg_ratelimited("%s: ignored unsupported option; type=%d, len=%d\n", + __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } next_opt: opt_len -= l; @@ -374,24 +373,25 @@ static int ndisc_constructor(struct neighbour *neigh) static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; - struct in6_addr maddr; struct net_device *dev = n->dev; + struct in6_addr maddr; - if (!dev || !__in6_dev_get(dev)) + if (!dev) return -EINVAL; + addrconf_addr_solict_mult(addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); - return 0; + return ipv6_dev_mc_inc(dev, &maddr); } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; - struct in6_addr maddr; struct net_device *dev = n->dev; + struct in6_addr maddr; - if (!dev || !__in6_dev_get(dev)) + if (!dev) return; + addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } @@ -415,15 +415,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); - if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + if (!skb) return NULL; - } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -434,7 +430,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ - skb_set_owner_w(skb, sk); + rcu_read_lock(); + skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); + rcu_read_unlock(); return skb; } @@ -450,7 +448,7 @@ static void ip6_nd_hdr(struct sk_buff *skb, rcu_read_lock(); idev = __in6_dev_get(skb->dev); - tclass = idev ? idev->cnf.ndisc_tclass : 0; + tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); @@ -470,16 +468,21 @@ static void ip6_nd_hdr(struct sk_buff *skb, void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(skb->dev); - struct sock *sk = net->ipv6.ndisc_sk; + struct net_device *dev; struct inet6_dev *idev; + struct net *net; + struct sock *sk; int err; - struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); + sk = net->ipv6.ndisc_sk; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; @@ -487,6 +490,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { + rcu_read_unlock(); kfree_skb(skb); return; } @@ -499,14 +503,14 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, csum_partial(icmp6h, skb->len, 0)); - ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len); + ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + dev = dst_dev_rcu(dst); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, dst->dev, + net, sk, skb, NULL, dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); @@ -534,7 +538,7 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = false; - inc_opt |= ifp->idev->cnf.force_tllao; + inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao); in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, @@ -745,10 +749,9 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { - if (!(neigh->nud_state & NUD_VALID)) { - ND_PRINTK(1, dbg, - "%s: trying to ucast probe in NUD_INVALID: %pI6\n", - __func__, target); + if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) { + net_dbg_ratelimited("%s: trying to ucast probe in NUD_INVALID: %pI6\n", + __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { @@ -765,11 +768,9 @@ static int pndisc_is_router(const void *pkey, struct pneigh_entry *n; int ret = -1; - read_lock_bh(&nd_tbl.lock); - n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) - ret = !!(n->flags & NTF_ROUTER); - read_unlock_bh(&nd_tbl.lock); + ret = !!(READ_ONCE(n->flags) & NTF_ROUTER); return ret; } @@ -783,7 +784,7 @@ void ndisc_update(const struct net_device *dev, struct neighbour *neigh, ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } -static void ndisc_recv_ns(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; @@ -797,18 +798,17 @@ static void ndisc_recv_ns(struct sk_buff *skb) struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); - bool inc; int is_router = -1; + SKB_DR(reason); u64 nonce = 0; + bool inc; - if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK(2, warn, "NS: packet too short\n"); - return; - } + if (skb->len < sizeof(struct nd_msg)) + return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK(2, warn, "NS: multicast target address\n"); - return; + net_dbg_ratelimited("NS: multicast target address\n"); + return reason; } /* @@ -816,21 +816,18 @@ static void ndisc_recv_ns(struct sk_buff *skb) * DAD has to be destined for solicited node multicast address. */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { - ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); - return; + net_dbg_ratelimited("NS: bad DAD packet (wrong destination)\n"); + return reason; } - if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, warn, "NS: invalid ND options\n"); - return; - } + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { - ND_PRINTK(2, warn, - "NS: invalid link-layer address length\n"); - return; + net_dbg_ratelimited("NS: invalid link-layer address length\n"); + return reason; } /* RFC2461 7.1.1: @@ -839,9 +836,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) * in the message. */ if (dad) { - ND_PRINTK(2, warn, - "NS: bad DAD packet (link-layer address option)\n"); - return; + net_dbg_ratelimited("NS: bad DAD packet (link-layer address option)\n"); + return reason; } } if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) @@ -857,10 +853,8 @@ have_ifp: if (nonce != 0 && ifp->dad_nonce == nonce) { u8 *np = (u8 *)&nonce; /* Matching nonce if looped back */ - ND_PRINTK(2, notice, - "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", - ifp->idev->dev->name, - &ifp->addr, np); + net_dbg_ratelimited("%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", + ifp->idev->dev->name, &ifp->addr, np); goto out; } /* @@ -869,7 +863,7 @@ have_ifp: * so fail our DAD process */ addrconf_dad_failure(skb, ifp); - return; + return reason; } else { /* * This is not a dad solicitation. @@ -901,12 +895,13 @@ have_ifp: idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? */ - return; + return reason; } if (ipv6_chk_acast_addr(net, dev, &msg->target) || - (idev->cnf.forwarding && - (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && + (READ_ONCE(idev->cnf.forwarding) && + (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) || + READ_ONCE(idev->cnf.proxy_ndp)) && (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && @@ -924,12 +919,14 @@ have_ifp: pneigh_enqueue(&nd_tbl, idev->nd_parms, n); goto out; } - } else + } else { + SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST); goto out; + } } if (is_router < 0) - is_router = idev->cnf.forwarding; + is_router = READ_ONCE(idev->cnf.forwarding); if (dad) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, @@ -958,6 +955,7 @@ have_ifp: true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); + reason = SKB_CONSUMED; } out: @@ -965,13 +963,14 @@ out: in6_ifa_put(ifp); else in6_dev_put(idev); + return reason; } static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) { struct inet6_dev *idev = __in6_dev_get(dev); - switch (idev->cnf.accept_untracked_na) { + switch (READ_ONCE(idev->cnf.accept_untracked_na)) { case 0: /* Don't accept untracked na (absent in neighbor cache) */ return 0; case 1: /* Create new entries from na if currently untracked */ @@ -986,7 +985,7 @@ static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) } } -static void ndisc_recv_na(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; @@ -999,22 +998,21 @@ static void ndisc_recv_na(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; + SKB_DR(reason); u8 new_state; - if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK(2, warn, "NA: packet too short\n"); - return; - } + if (skb->len < sizeof(struct nd_msg)) + return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK(2, warn, "NA: target address is multicast\n"); - return; + net_dbg_ratelimited("NA: target address is multicast\n"); + return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { - ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); - return; + net_dbg_ratelimited("NA: solicited NA is multicasted\n"); + return reason; } /* For some 802.11 wireless deployments (and possibly other networks), @@ -1023,19 +1021,17 @@ static void ndisc_recv_na(struct sk_buff *skb) * drop_unsolicited_na takes precedence over accept_untracked_na */ if (!msg->icmph.icmp6_solicited && idev && - idev->cnf.drop_unsolicited_na) - return; + READ_ONCE(idev->cnf.drop_unsolicited_na)) + return reason; + + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; - if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, warn, "NS: invalid ND option\n"); - return; - } if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { - ND_PRINTK(2, warn, - "NA: invalid link-layer address length\n"); - return; + net_dbg_ratelimited("NA: invalid link-layer address length\n"); + return reason; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); @@ -1043,7 +1039,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { addrconf_dad_failure(skb, ifp); - return; + return reason; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing @@ -1055,11 +1051,11 @@ static void ndisc_recv_na(struct sk_buff *skb) unsolicited advertisement. */ if (skb->pkt_type != PACKET_LOOPBACK) - ND_PRINTK(1, warn, - "NA: %pM advertised our address %pI6c on %s!\n", - eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); + net_warn_ratelimited("NA: %pM advertised our address %pI6c on %s!\n", + eth_hdr(skb)->h_source, &ifp->addr, + ifp->idev->dev->name); in6_ifa_put(ifp); - return; + return reason; } neigh = neigh_lookup(&nd_tbl, &msg->target, dev); @@ -1080,7 +1076,7 @@ static void ndisc_recv_na(struct sk_buff *skb) * Note that we don't do a (daddr == all-routers-mcast) check. */ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE; - if (!neigh && lladdr && idev && idev->cnf.forwarding) { + if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) { if (accept_untracked_na(dev, saddr)) { neigh = neigh_create(&nd_tbl, &msg->target, dev); new_state = NUD_STALE; @@ -1091,7 +1087,7 @@ static void ndisc_recv_na(struct sk_buff *skb) u8 old_flags = neigh->flags; struct net *net = dev_net(dev); - if (neigh->nud_state & NUD_FAILED) + if (READ_ONCE(neigh->nud_state) & NUD_FAILED) goto out; /* @@ -1100,8 +1096,9 @@ static void ndisc_recv_na(struct sk_buff *skb) * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && - net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && - pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + READ_ONCE(net->ipv6.devconf_all->forwarding) && + READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && + pneigh_lookup(&nd_tbl, net, &msg->target, dev)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } @@ -1120,13 +1117,14 @@ static void ndisc_recv_na(struct sk_buff *skb) */ rt6_clean_tohost(dev_net(dev), saddr); } - + reason = SKB_CONSUMED; out: neigh_release(neigh); } + return reason; } -static void ndisc_recv_rs(struct sk_buff *skb) +static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) { struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); @@ -1135,18 +1133,19 @@ static void ndisc_recv_rs(struct sk_buff *skb) const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; + SKB_DR(reason); if (skb->len < sizeof(*rs_msg)) - return; + return SKB_DROP_REASON_PKT_TOO_SMALL; idev = __in6_dev_get(skb->dev); if (!idev) { - ND_PRINTK(1, err, "RS: can't find in6 device\n"); - return; + net_err_ratelimited("RS: can't find in6 device\n"); + return reason; } /* Don't accept RS if we're not in router mode */ - if (!idev->cnf.forwarding) + if (!READ_ONCE(idev->cnf.forwarding)) goto out; /* @@ -1157,10 +1156,8 @@ static void ndisc_recv_rs(struct sk_buff *skb) goto out; /* Parse ND options */ - if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); - goto out; - } + if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, @@ -1177,9 +1174,10 @@ static void ndisc_recv_rs(struct sk_buff *skb) NEIGH_UPDATE_F_OVERRIDE_ISROUTER, NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); + reason = SKB_CONSUMED; } out: - return; + return reason; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) @@ -1228,74 +1226,63 @@ errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } -static void ndisc_router_discovery(struct sk_buff *skb) +static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); + bool send_ifinfo_notify = false; struct neighbour *neigh = NULL; - struct inet6_dev *in6_dev; + struct ndisc_options ndopts; struct fib6_info *rt = NULL; + struct inet6_dev *in6_dev; + struct fib6_table *table; u32 defrtr_usr_metric; + unsigned int pref = 0; + __u32 old_if_flags; struct net *net; + SKB_DR(reason); int lifetime; - struct ndisc_options ndopts; int optlen; - unsigned int pref = 0; - __u32 old_if_flags; - bool send_ifinfo_notify = false; __u8 *opt = (__u8 *)(ra_msg + 1); optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); - ND_PRINTK(2, info, - "RA: %s, dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, dev: %s\n", __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, "RA: source address is not link-local\n"); - return; - } - if (optlen < 0) { - ND_PRINTK(2, warn, "RA: packet too short\n"); - return; + net_dbg_ratelimited("RA: source address is not link-local\n"); + return reason; } + if (optlen < 0) + return SKB_DROP_REASON_PKT_TOO_SMALL; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { - ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); - return; + net_dbg_ratelimited("RA: from host or unauthorized router\n"); + return reason; } #endif - /* - * set the RA_RECV flag in the interface - */ - in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { - ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", - skb->dev->name); - return; + net_err_ratelimited("RA: can't find inet6 device for %s\n", skb->dev->name); + return reason; } - if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { - ND_PRINTK(2, warn, "RA: invalid ND options\n"); - return; - } + if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { - ND_PRINTK(2, info, - "RA: %s, did not accept ra for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, did not accept ra for dev: %s\n", __func__, + skb->dev->name); goto skip_linkparms; } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { - ND_PRINTK(2, info, - "RA: %s, nodetype is NODEFAULT, dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__, + skb->dev->name); goto skip_linkparms; } #endif @@ -1323,10 +1310,17 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (old_if_flags != in6_dev->if_flags) send_ifinfo_notify = true; - if (!in6_dev->cnf.accept_ra_defrtr) { - ND_PRINTK(2, info, - "RA: %s, defrtr is false for dev: %s\n", - __func__, skb->dev->name); + if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) { + net_dbg_ratelimited("RA: %s, defrtr is false for dev: %s\n", __func__, + skb->dev->name); + goto skip_defrtr; + } + + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + if (lifetime != 0 && + lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) { + net_dbg_ratelimited("RA: router lifetime (%ds) is too short: %s\n", lifetime, + skb->dev->name); goto skip_defrtr; } @@ -1334,21 +1328,18 @@ static void ndisc_router_discovery(struct sk_buff *skb) * accept_ra_from_local is set to true. */ net = dev_net(in6_dev->dev); - if (!in6_dev->cnf.accept_ra_from_local && + if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { - ND_PRINTK(2, info, - "RA from local address detected on dev: %s: default router ignored\n", - skb->dev->name); + net_dbg_ratelimited("RA from local address detected on dev: %s: default router ignored\n", + skb->dev->name); goto skip_defrtr; } - lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); - #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ if (pref == ICMPV6_ROUTER_PREF_INVALID || - !in6_dev->cnf.accept_ra_rtr_pref) + !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref)) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif /* routes added from RAs do not use nexthop objects */ @@ -1358,11 +1349,10 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { - ND_PRINTK(0, err, - "RA: %s got default router without neighbour\n", - __func__); + net_err_ratelimited("RA: %s got default router without neighbour\n", + __func__); fib6_info_release(rt); - return; + return reason; } } /* Set default route metric as specified by user */ @@ -1373,32 +1363,30 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = NULL; } - ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", - rt, lifetime, defrtr_usr_metric, skb->dev->name); + net_dbg_ratelimited("RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime, + defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { - ND_PRINTK(3, info, "RA: adding default router\n"); + net_dbg_ratelimited("RA: adding default router\n"); if (neigh) neigh_release(neigh); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, - skb->dev, pref, defrtr_usr_metric); + skb->dev, pref, defrtr_usr_metric, + lifetime); if (!rt) { - ND_PRINTK(0, err, - "RA: %s failed to add default route\n", - __func__); - return; + net_err_ratelimited("RA: %s failed to add default route\n", __func__); + return reason; } neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { - ND_PRINTK(0, err, - "RA: %s got default router without neighbour\n", - __func__); + net_err_ratelimited("RA: %s got default router without neighbour\n", + __func__); fib6_info_release(rt); - return; + return reason; } neigh->flags |= NTF_ROUTER; } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) { @@ -1409,16 +1397,25 @@ static void ndisc_router_discovery(struct sk_buff *skb) inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } - if (rt) + if (rt) { + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(rt, jiffies + (HZ * lifetime)); - if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && + fib6_add_gc_list(rt); + + spin_unlock_bh(&table->tb6_lock); + } + if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 && ra_msg->icmph.icmp6_hop_limit) { - if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { - in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <= + ra_msg->icmph.icmp6_hop_limit) { + WRITE_ONCE(in6_dev->cnf.hop_limit, + ra_msg->icmph.icmp6_hop_limit); fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { - ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); + net_dbg_ratelimited("RA: Got route advertisement with lower hop_limit than minimum\n"); } } @@ -1452,7 +1449,7 @@ skip_defrtr: BASE_REACHABLE_TIME, rtime); NEIGH_VAR_SET(in6_dev->nd_parms, GC_STALETIME, 3 * rtime); - in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + neigh_set_reach_time(in6_dev->nd_parms); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } @@ -1474,8 +1471,7 @@ skip_linkparms: lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { - ND_PRINTK(2, warn, - "RA: invalid link-layer address length\n"); + net_dbg_ratelimited("RA: invalid link-layer address length\n"); goto out; } } @@ -1485,26 +1481,25 @@ skip_linkparms: NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER, NDISC_ROUTER_ADVERTISEMENT, &ndopts); + reason = SKB_CONSUMED; } if (!ipv6_accept_ra(in6_dev)) { - ND_PRINTK(2, info, - "RA: %s, accept_ra is false for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, accept_ra is false for dev: %s\n", __func__, + skb->dev->name); goto out; } #ifdef CONFIG_IPV6_ROUTE_INFO - if (!in6_dev->cnf.accept_ra_from_local && + if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { - ND_PRINTK(2, info, - "RA from local address detected on dev: %s: router info ignored.\n", - skb->dev->name); + net_dbg_ratelimited("RA from local address detected on dev: %s: router info ignored.\n", + skb->dev->name); goto skip_routeinfo; } - if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { + if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; @@ -1516,11 +1511,14 @@ skip_linkparms: continue; #endif if (ri->prefix_len == 0 && - !in6_dev->cnf.accept_ra_defrtr) + !READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) + continue; + if (ri->lifetime != 0 && + ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) continue; - if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) + if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen)) continue; - if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) + if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen)) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); @@ -1533,14 +1531,13 @@ skip_routeinfo: #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { - ND_PRINTK(2, info, - "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", + __func__, skb->dev->name); goto out; } #endif - if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { + if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_pi; p; @@ -1551,7 +1548,7 @@ skip_routeinfo: } } - if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) { + if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) { __be32 n; u32 mtu; @@ -1564,9 +1561,9 @@ skip_routeinfo: } if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { - ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); - } else if (in6_dev->cnf.mtu6 != mtu) { - in6_dev->cnf.mtu6 = mtu; + net_dbg_ratelimited("RA: invalid mtu: %d\n", mtu); + } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) { + WRITE_ONCE(in6_dev->cnf.mtu6, mtu); fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } @@ -1583,7 +1580,7 @@ skip_routeinfo: } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { - ND_PRINTK(2, warn, "RA: invalid RA options\n"); + net_dbg_ratelimited("RA: invalid RA options\n"); } out: /* Send a notify if RA changed managed/otherconf flags or @@ -1595,47 +1592,47 @@ out: fib6_info_release(rt); if (neigh) neigh_release(neigh); + return reason; } -static void ndisc_redirect_rcv(struct sk_buff *skb) +static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) { - u8 *hdr; - struct ndisc_options ndopts; struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); + struct ndisc_options ndopts; + SKB_DR(reason); + u8 *hdr; #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: - ND_PRINTK(2, warn, - "Redirect: from host or unauthorized router\n"); - return; + net_dbg_ratelimited("Redirect: from host or unauthorized router\n"); + return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, - "Redirect: source address is not link-local\n"); - return; + net_dbg_ratelimited("Redirect: source address is not link-local\n"); + return reason; } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) - return; + return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex); - return; + return reason; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) - return; + return SKB_DROP_REASON_PKT_TOO_SMALL; - icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); + return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, @@ -1656,7 +1653,7 @@ static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; - struct net *net = dev_net(dev); + struct net *net = dev_net_rcu(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; @@ -1671,22 +1668,20 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; - if (netif_is_l3_master(skb->dev)) { - dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) return; } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { - ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", - dev->name); + net_dbg_ratelimited("Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, - "Redirect: target address is not link-local unicast\n"); + net_dbg_ratelimited("Redirect: target address is not link-local unicast\n"); return; } @@ -1702,25 +1697,23 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) if (IS_ERR(dst)) return; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { - ND_PRINTK(2, warn, - "Redirect: destination is not a neighbour\n"); + net_dbg_ratelimited("Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); + + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); - if (peer) - inet_putpeer(peer); + if (!ret) goto release; if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { - ND_PRINTK(2, warn, - "Redirect: no neigh for target address\n"); + net_dbg_ratelimited("Redirect: no neigh for target address\n"); goto release; } @@ -1781,8 +1774,9 @@ release: static void pndisc_redo(struct sk_buff *skb) { - ndisc_recv_ns(skb); - kfree_skb(skb); + enum skb_drop_reason reason = ndisc_recv_ns(skb); + + kfree_skb_reason(skb, reason); } static int ndisc_is_multicast(const void *pkey) @@ -1797,63 +1791,62 @@ static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) if (!idev) return true; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && - idev->cnf.suppress_frag_ndisc) { + READ_ONCE(idev->cnf.suppress_frag_ndisc)) { net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); return true; } return false; } -int ndisc_rcv(struct sk_buff *skb) +enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; + SKB_DR(reason); if (ndisc_suppress_frag_ndisc(skb)) - return 0; + return SKB_DROP_REASON_IPV6_NDISC_FRAG; if (skb_linearize(skb)) - return 0; + return SKB_DROP_REASON_NOMEM; msg = (struct nd_msg *)skb_transport_header(skb); __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { - ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", - ipv6_hdr(skb)->hop_limit); - return 0; + net_dbg_ratelimited("NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); + return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { - ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", - msg->icmph.icmp6_code); - return 0; + net_dbg_ratelimited("NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); + return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - ndisc_recv_ns(skb); + reason = ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: - ndisc_recv_na(skb); + reason = ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: - ndisc_recv_rs(skb); + reason = ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: - ndisc_router_discovery(skb); + reason = ndisc_router_discovery(skb); break; case NDISC_REDIRECT: - ndisc_redirect_rcv(skb); + reason = ndisc_redirect_rcv(skb); break; } - return 0; + return reason; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1873,8 +1866,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, idev = in6_dev_get(dev); if (!idev) break; - if (idev->cnf.ndisc_notify || - net->ipv6.devconf_all->ndisc_notify) + if (READ_ONCE(idev->cnf.ndisc_notify) || + READ_ONCE(net->ipv6.devconf_all->ndisc_notify)) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; @@ -1883,8 +1876,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, if (!idev) evict_nocarrier = true; else { - evict_nocarrier = idev->cnf.ndisc_evict_nocarrier && - net->ipv6.devconf_all->ndisc_evict_nocarrier; + evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) && + READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier); in6_dev_put(idev); } @@ -1914,13 +1907,13 @@ static struct notifier_block ndisc_netdev_notifier = { }; #ifdef CONFIG_SYSCTL -static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, +static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); + strscpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, @@ -1929,7 +1922,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, +int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; @@ -1955,10 +1948,10 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { - if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) - idev->nd_parms->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); - idev->tstamp = jiffies; + if (ctl->data == NEIGH_VAR_PTR(idev->nd_parms, BASE_REACHABLE_TIME)) + neigh_set_reach_time(idev->nd_parms); + + WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); } @@ -1977,9 +1970,8 @@ static int __net_init ndisc_net_init(struct net *net) err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - ND_PRINTK(0, err, - "NDISC: Failed to initialize the control socket (err %d)\n", - err); + net_err_ratelimited("NDISC: Failed to initialize the control socket (err %d)\n", + err); return err; } @@ -1988,7 +1980,7 @@ static int __net_init ndisc_net_init(struct net *net) np = inet6_sk(sk); np->hop_limit = 255; /* Do not loopback ndisc messages */ - np->mc_loop = 0; + inet6_clear_bit(MC6_LOOP, sk); return 0; } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 857713d7a38a..46540a5a4331 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -24,7 +24,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff { const struct ipv6hdr *iph = ipv6_hdr(skb); struct sock *sk = sk_to_full_sk(sk_partial); - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); struct flow_keys flkeys; unsigned int hh_len; struct dst_entry *dst; @@ -36,6 +36,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), }; int err; @@ -61,8 +62,11 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { - skb_dst_set(skb, NULL); + xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -71,7 +75,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff #endif /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; + hh_len = skb_dst_dev(skb)->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) @@ -126,7 +130,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; @@ -163,20 +167,20 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct ip6_fraglist_iter iter; struct sk_buff *frag2; - if (first_len - hlen > mtu || - skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + if (first_len - hlen > mtu) goto blackhole; - if (skb_cloned(skb)) + if (skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag2) { - if (frag2->len > mtu || - skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + if (frag2->len > mtu) goto blackhole; /* Partially cloned skb? */ - if (skb_shared(frag2)) + if (skb_shared(frag2) || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path; } @@ -192,7 +196,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -225,7 +229,7 @@ slow_path: goto blackhole; } - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 0ba62f4868f9..81daf82ddc2d 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -6,6 +6,17 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER +# old sockopt interface and eval loop +config IP6_NF_IPTABLES_LEGACY + tristate "Legacy IP6 tables support" + depends on INET && IPV6 && NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES + default m if NETFILTER_XTABLES_LEGACY + help + ip6tables is a legacy packet classifier. + This is not needed if you are using iptables over nftables + (iptables-nft). + config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" help @@ -147,7 +158,7 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' depends on NETFILTER_ADVANCED - depends on IP6_NF_MANGLE || IP6_NF_RAW + depends on IP6_NF_MANGLE || IP6_NF_RAW || NFT_COMPAT help This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -185,7 +196,9 @@ config IP6_NF_TARGET_HL config IP6_NF_FILTER tristate "Packet filtering" - default m if NETFILTER_ADVANCED=n + default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY + depends on IP6_NF_IPTABLES_LEGACY + tristate help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -195,7 +208,7 @@ config IP6_NF_FILTER config IP6_NF_TARGET_REJECT tristate "REJECT target support" - depends on IP6_NF_FILTER + depends on IP6_NF_FILTER || NFT_COMPAT select NF_REJECT_IPV6 default m if NETFILTER_ADVANCED=n help @@ -220,7 +233,8 @@ config IP6_NF_TARGET_SYNPROXY config IP6_NF_MANGLE tristate "Packet mangling" - default m if NETFILTER_ADVANCED=n + default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY + depends on IP6_NF_IPTABLES_LEGACY help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -230,6 +244,7 @@ config IP6_NF_MANGLE config IP6_NF_RAW tristate 'raw table support (required for TRACE)' + depends on IP6_NF_IPTABLES_LEGACY help This option adds a `raw' table to ip6tables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -243,6 +258,7 @@ config IP6_NF_SECURITY tristate "Security table" depends on SECURITY depends on NETFILTER_ADVANCED + depends on IP6_NF_IPTABLES_LEGACY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. @@ -253,6 +269,7 @@ config IP6_NF_NAT tristate "ip6tables NAT support" depends on NF_CONNTRACK depends on NETFILTER_ADVANCED + depends on IP6_NF_IPTABLES_LEGACY select NF_NAT select NETFILTER_XT_NAT help @@ -262,25 +279,23 @@ config IP6_NF_NAT To compile it as a module, choose M here. If unsure, say N. -if IP6_NF_NAT - config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NETFILTER_XT_TARGET_MASQUERADE + depends on IP6_NF_NAT help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP6_NF_TARGET_NPT tristate "NPT (Network Prefix translation) target support" + depends on IP6_NF_NAT || NFT_COMPAT help This option adds the `SNPT' and `DNPT' target, which perform stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296. To compile it as a module, choose M here. If unsure, say N. -endif # IP6_NF_NAT - endif # IP6_NF_IPTABLES endmenu diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index b8d6dc9aeeb6..66ce6fa5b2f5 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -4,7 +4,7 @@ # # Link order matters here. -obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o +obj-$(CONFIG_IP6_NF_IPTABLES_LEGACY) += ip6_tables.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2d816277f2c5..d585ac3c1113 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -18,7 +18,6 @@ #include <linux/netdevice.h> #include <linux/module.h> #include <linux/poison.h> -#include <linux/icmpv6.h> #include <net/ipv6.h> #include <net/compat.h> #include <linux/uaccess.h> @@ -35,7 +34,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); -MODULE_ALIAS("ip6t_icmp6"); void *ip6t_alloc_initial_table(const struct xt_table *info) { @@ -294,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); @@ -1062,7 +1060,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_counters *counters; struct ip6t_entry *iter; - ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; @@ -1108,7 +1105,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); - return ret; + return 0; put_module: module_put(t->me); @@ -1128,6 +1125,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1136,6 +1135,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1504,6 +1505,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1512,6 +1515,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1751,6 +1756,10 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { + struct ip6t_entry *iter; + + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } @@ -1764,7 +1773,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, goto out_free; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; @@ -1802,52 +1811,6 @@ void ip6t_unregister_table_exit(struct net *net, const char *name) __ip6t_unregister_table(net, table); } -/* Returns 1 if the type and code is matched by the range, 0 otherwise */ -static inline bool -icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, - u_int8_t type, u_int8_t code, - bool invert) -{ - return (type == test_type && code >= min_code && code <= max_code) - ^ invert; -} - -static bool -icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) -{ - const struct icmp6hdr *ic; - struct icmp6hdr _icmph; - const struct ip6t_icmp *icmpinfo = par->matchinfo; - - /* Must not be a fragment. */ - if (par->fragoff != 0) - return false; - - ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); - if (ic == NULL) { - /* We've been asked to examine this packet, and we - * can't. Hence, no choice but to drop. - */ - par->hotdrop = true; - return false; - } - - return icmp6_type_code_match(icmpinfo->type, - icmpinfo->code[0], - icmpinfo->code[1], - ic->icmp6_type, ic->icmp6_code, - !!(icmpinfo->invflags&IP6T_ICMP_INV)); -} - -/* Called when user tries to insert an entry of this type. */ -static int icmp6_checkentry(const struct xt_mtchk_param *par) -{ - const struct ip6t_icmp *icmpinfo = par->matchinfo; - - /* Must specify no unknown invflags */ - return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; -} - /* The built-in targets: standard (NULL) and error. */ static struct xt_target ip6t_builtin_tg[] __read_mostly = { { @@ -1879,18 +1842,6 @@ static struct nf_sockopt_ops ip6t_sockopts = { .owner = THIS_MODULE, }; -static struct xt_match ip6t_builtin_mt[] __read_mostly = { - { - .name = "icmp6", - .match = icmp6_match, - .matchsize = sizeof(struct ip6t_icmp), - .checkentry = icmp6_checkentry, - .proto = IPPROTO_ICMPV6, - .family = NFPROTO_IPV6, - .me = THIS_MODULE, - }, -}; - static int __net_init ip6_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_IPV6); @@ -1918,19 +1869,14 @@ static int __init ip6_tables_init(void) ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); if (ret < 0) goto err2; - ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); - if (ret < 0) - goto err4; /* Register setsockopt */ ret = nf_register_sockopt(&ip6t_sockopts); if (ret < 0) - goto err5; + goto err4; return 0; -err5: - xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); err4: xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); err2: @@ -1943,7 +1889,6 @@ static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); - xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); unregister_pernet_subsys(&ip6_tables_net_ops); } diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index a01d9b842bd0..67c87a88cde4 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -72,7 +72,9 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, goto out; } - if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE)) + if (rt->rt6i_idev->dev == dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex || + (flags & XT_RPFILTER_LOOSE)) ret = true; out: ip6_rt_put(rt); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index df785ebda0ca..e8992693e14a 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -43,7 +43,7 @@ static int ip6table_filter_table_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ip6t_standard *)repl->entries)[1].target.verdict = - forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index a88b2ce4a3cb..8dd4cd0c47bd 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -31,10 +31,10 @@ static const struct xt_table packet_mangler = { static unsigned int ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - unsigned int ret; struct in6_addr saddr, daddr; - u_int8_t hop_limit; - u_int32_t flowlabel, mark; + unsigned int ret, verdict; + u32 flowlabel, mark; + u8 hop_limit; int err; /* save source/dest address, mark, hoplimit, flowlabel, priority, */ @@ -47,8 +47,9 @@ ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *sta flowlabel = *((u_int32_t *)ipv6_hdr(skb)); ret = ip6t_do_table(priv, skb, state); + verdict = ret & NF_VERDICT_MASK; - if (ret != NF_DROP && ret != NF_STOLEN && + if (verdict != NF_DROP && verdict != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) || skb->mark != mark || diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index bf3cb3a13600..e119d4f090cc 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -147,26 +147,31 @@ static struct pernet_operations ip6table_nat_net_ops = { static int __init ip6table_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv6_table, - ip6table_nat_table_init); + int ret; + /* net->gen->ptr[ip6table_nat_net_id] must be allocated + * before calling ip6t_nat_register_lookups(). + */ + ret = register_pernet_subsys(&ip6table_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&ip6table_nat_net_ops); + ret = xt_register_template(&nf_nat_ipv6_table, + ip6table_nat_table_init); if (ret) - xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); return ret; } static void __exit ip6table_nat_exit(void) { - unregister_pernet_subsys(&ip6table_nat_net_ops); xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); } module_init(ip6table_nat_init); module_exit(ip6table_nat_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Ip6tables legacy nat table"); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 08861d5d1f4d..fc9f6754028f 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -106,3 +106,4 @@ static void __exit ip6table_raw_fini(void) module_init(ip6table_raw_init); module_exit(ip6table_raw_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Ip6tables legacy raw table"); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d13240f13607..64ab23ff559b 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -62,7 +62,6 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { } }; static int nf_ct_frag6_sysctl_register(struct net *net) @@ -87,7 +86,8 @@ static int nf_ct_frag6_sysctl_register(struct net *net) table[2].data = &nf_frag->fqdir->high_thresh; table[2].extra1 = &nf_frag->fqdir->low_thresh; - hdr = register_net_sysctl(net, "net/netfilter", table); + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_ct_frag6_sysctl_table)); if (hdr == NULL) goto err_reg; @@ -104,7 +104,7 @@ err_alloc: static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); @@ -123,7 +123,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { @@ -132,7 +133,7 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static void nf_ct_frag6_expire(struct timer_list *t) { - struct inet_frag_queue *frag = from_timer(frag, t, timer); + struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); @@ -154,6 +155,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; + q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; @@ -163,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - const struct frag_hdr *fhdr, int nhoff) + const struct frag_hdr *fhdr, int nhoff, + int *refs) { unsigned int payload_len; struct net_device *dev; @@ -217,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EPROTO; } if (end > fq->q.len) { @@ -263,7 +269,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) @@ -283,7 +289,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = nf_ct_frag6_reasm(fq, skb, prev, dev); + err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or @@ -293,10 +299,11 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); err: skb_dst_drop(skb); return -EINVAL; @@ -310,13 +317,14 @@ err: * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { void *reasm_data; int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -326,9 +334,9 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, if (!reasm_data) goto err; - payload_len = ((skb->data - skb_network_header(skb)) - + payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - - sizeof(struct frag_hdr)); + sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) { net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", payload_len); @@ -367,7 +375,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, return 0; err: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EINVAL; } @@ -442,6 +450,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; + int refs = 0; u8 prevhdr; /* Jumbo payload inhibits frag. header */ @@ -468,24 +477,26 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - skb_orphan(skb); + rcu_read_lock(); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { + rcu_read_unlock(); pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; } spin_lock_bh(&fq->q.lock); - ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; } spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q); + rcu_read_unlock(); + inet_frag_putn(&fq->q, refs); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index cb4eb1d2c620..be7817fbc024 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> +#include <linux/rcupdate.h> #include <linux/sysctl.h> #include <net/ipv6_frag.h> @@ -96,6 +97,12 @@ static void __net_exit defrag6_net_exit(struct net *net) } } +static const struct nf_defrag_hook defrag_hook = { + .owner = THIS_MODULE, + .enable = nf_defrag_ipv6_enable, + .disable = nf_defrag_ipv6_disable, +}; + static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; @@ -114,6 +121,9 @@ static int __init nf_defrag_init(void) pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } + + rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook); + return ret; cleanup_frag6: @@ -124,6 +134,7 @@ cleanup_frag6: static void __exit nf_defrag_fini(void) { + rcu_assign_pointer(nf_defrag_v6_hook, NULL); unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } @@ -171,3 +182,4 @@ module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 defragmentation support"); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index a0a2de30be3e..6da3102b7c1b 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -38,7 +38,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, } skb_dst_drop(skb); skb_dst_set(skb, dst); - skb->dev = dst->dev; + skb->dev = dst_dev(dst); skb->protocol = htons(ETH_P_IPV6); return true; @@ -47,11 +47,12 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { - if (this_cpu_read(nf_skb_duplicated)) - return; + local_bh_disable(); + if (current->in_nf_duplicate) + goto out; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) - return; + goto out; #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_reset_ct(skb); @@ -63,12 +64,14 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->hop_limit; } if (nf_dup_ipv6_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip6_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } +out: + local_bh_enable(); } EXPORT_SYMBOL_GPL(nf_dup_ipv6); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index f61d4f18e1cf..ef5b7e85cffa 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,19 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit); +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen); +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook); + static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -83,7 +96,7 @@ struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv6.devconf_all->hop_limit); + READ_ONCE(net->ipv6.devconf_all->hop_limit)); nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen); nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); @@ -91,6 +104,32 @@ struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); +static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + u8 proto = ip6h->nexthdr; + u8 _type, *tp; + int thoff; + __be16 fo; + + thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo); + + if (thoff < 0 || thoff >= skb->len || fo != 0) + return false; + + if (proto != IPPROTO_ICMPV6) + return false; + + tp = skb_header_pointer(skb, + thoff + offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + + if (!tp) + return false; + + return *tp == ICMPV6_DEST_UNREACH; +} + struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, @@ -104,6 +143,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, if (!nf_reject_ip6hdr_validate(oldskb)) return NULL; + /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */ + if (nf_skb_is_icmp6_unreach(oldskb)) + return NULL; + /* Include "As much of invoking packet as possible without the ICMPv6 * packet exceeding the minimum IPv6 MTU" in the ICMP payload. */ @@ -124,7 +167,7 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6, - net->ipv6.devconf_all->hop_limit); + READ_ONCE(net->ipv6.devconf_all->hop_limit)); skb_reset_transport_header(nskb); icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr)); @@ -146,9 +189,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook) +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); u8 proto; @@ -192,11 +236,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, return otcph; } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit) +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); @@ -216,40 +260,30 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, return ip6h; } -EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen) +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; - int needs_ack; skb_reset_transport_header(nskb); - tcph = skb_put(nskb, sizeof(struct tcphdr)); + tcph = skb_put_zero(nskb, sizeof(struct tcphdr)); /* Truncate to length (no data) */ tcph->doff = sizeof(struct tcphdr)/4; tcph->source = oth->dest; tcph->dest = oth->source; if (oth->ack) { - needs_ack = 0; tcph->seq = oth->ack_seq; - tcph->ack_seq = 0; } else { - needs_ack = 1; tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + otcplen - (oth->doff<<2)); - tcph->seq = 0; + tcph->ack = 1; } - /* Reset flags */ - ((u_int8_t *)tcph)[13] = 0; tcph->rst = 1; - tcph->ack = needs_ack; - tcph->window = 0; - tcph->urg_ptr = 0; - tcph->check = 0; /* Adjust TCP checksum */ tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, @@ -258,7 +292,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, csum_partial(tcph, sizeof(struct tcphdr), 0)); } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) { @@ -278,14 +311,12 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { - struct net_device *br_indev __maybe_unused; - struct sk_buff *nskb; - struct tcphdr _otcph; - const struct tcphdr *otcph; - unsigned int otcplen, hh_len; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); - struct ipv6hdr *ip6h; struct dst_entry *dst = NULL; + const struct tcphdr *otcph; + struct sk_buff *nskb; + struct tcphdr _otcph; + unsigned int otcplen; struct flowi6 fl6; if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || @@ -305,14 +336,14 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; - if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { + if (!skb_dst(oldskb)) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; skb_dst_set(oldskb, dst); } - fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); + fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst_dev(oldskb)); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); @@ -324,9 +355,8 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, if (IS_ERR(dst)) return; - hh_len = (dst->dev->hard_header_len + 15)&~15; - nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) + dst->trailer_len, + nskb = alloc_skb(LL_MAX_HEADER + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, GFP_ATOMIC); if (!nskb) { @@ -339,12 +369,12 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, nskb->mark = fl6.flowi6_mark; - skb_reserve(nskb, hh_len + dst->header_len); - ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, - ip6_dst_hoplimit(dst)); + skb_reserve(nskb, LL_MAX_HEADER); + nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); + nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on @@ -353,9 +383,16 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb); - if (br_indev) { + if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct ipv6hdr *ip6h = ipv6_hdr(nskb); + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(oldskb, net); + if (!br_indev) { + kfree_skb(nskb); + return; + } nskb->dev = br_indev; nskb->protocol = htons(ETH_P_IPV6); @@ -403,8 +440,7 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; - if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && - nf_reject6_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); @@ -412,3 +448,4 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, EXPORT_SYMBOL_GPL(nf_send_unreach6); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 packet rejection core"); diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index a7690ec62325..ced8bd44828e 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp6_lib_lookup(net, saddr, sport, daddr, dport, @@ -103,6 +102,10 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn const *ct; +#endif tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) { @@ -136,6 +139,25 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, return NULL; } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Do the lookup with the original socket address in + * case this is a reply packet of an established + * SNAT-ted connection. + */ + ct = nf_ct_get(skb, &ctinfo); + if (ct && + ((tproto != IPPROTO_ICMPV6 && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (tproto == IPPROTO_ICMPV6 && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; + dport = (tproto == IPPROTO_TCP) ? + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr, sport, dport, indev); } diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 929502e51203..b2f59ed9d7cc 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -63,7 +63,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule_put(inet_twsk(sk)); + nf_tproxy_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } @@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -94,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, hinfo, skb, + sk = inet6_lookup_listener(net, skb, thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr, + sk = __inet6_lookup_established(net, saddr, sport, daddr, ntohs(dport), in->ifindex, 0); break; default: diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index c82f3fdd4a65..492a811828a7 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -38,13 +38,13 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, sizeof(struct in6_addr)); if (err < 0) return err; if (tb[NFTA_DUP_SREG_DEV]) - err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); return err; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 36dc14b34388..421036a3605b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -41,8 +41,6 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); - } else if (priv->flags & NFTA_FIB_F_IIF) { - fl6->flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); } if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) @@ -52,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowi6_mark = pkt->skb->mark; fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; + fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); return lookup_flags; } @@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); + const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = ®s->data[priv->dreg]; struct ipv6hdr *iph, _iph; @@ -169,6 +169,11 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, struct rt6_info *rt; int lookup_flags; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + if (priv->flags & NFTA_FIB_F_IIF) oif = nft_in(pkt); else if (priv->flags & NFTA_FIB_F_OIF) @@ -180,17 +185,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); - - if (nft_hook(pkt) == NF_INET_PRE_ROUTING || - nft_hook(pkt) == NF_INET_INGRESS) { - if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) || - nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } + if (nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; } + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); + *dest = 0; rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, lookup_flags); @@ -201,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev && - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) - goto put_rt_err; + if (!oif) { + found = rt->rt6i_idev->dev; + } else { + if (oif == rt->rt6i_idev->dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + found = oif; + } - nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); + nft_fib_store_result(dest, priv, found); put_rt_err: ip6_rt_put(rt); } diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index b5205311f372..1c9b283a4132 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -104,18 +104,20 @@ EXPORT_SYMBOL(ip6_find_1stfragopt); int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + + rcu_read_lock(); if (hoplimit == 0) { - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev; - rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) - hoplimit = idev->cnf.hop_limit; + hoplimit = READ_ONCE(idev->cnf.hop_limit); else - hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; - rcu_read_unlock(); + hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); } + rcu_read_unlock(); + return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); @@ -141,7 +143,7 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 808983bc2ec9..e4afc651731a 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -45,7 +45,7 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, return 0; } -static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int ping_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip6_datagram_connect() and @@ -56,7 +56,7 @@ static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; - return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) @@ -89,7 +89,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EAFNOSUPPORT; } daddr = &(u->sin6_addr); - if (np->sndflow) + if (inet6_test_bit(SNDFLOW, sk)) fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) oif = u->sin6_scope_id; @@ -107,19 +107,18 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif && ipv6_addr_is_multicast(daddr)) - oif = np->mcast_oif; + oif = READ_ONCE(np->mcast_oif); else if (!oif) - oif = np->ucast_oif; + oif = READ_ONCE(np->ucast_oif); addr_type = ipv6_addr_type(daddr); if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || (addr_type & IPV6_ADDR_MAPPED) || - (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if && + l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if)) return -EINVAL; - ipcm6_init_sk(&ipc6, np); - ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + ipcm6_init_sk(&ipc6, sk); fl6.flowi6_oif = oif; @@ -143,7 +142,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.saddr = np->saddr; fl6.daddr = *daddr; fl6.flowi6_mark = ipc6.sockc.mark; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); @@ -153,12 +152,12 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); if (IS_ERR(dst)) return PTR_ERR(dst); - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; @@ -209,11 +208,11 @@ struct proto pingv6_prot = { .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), }; EXPORT_SYMBOL_GPL(pingv6_prot); @@ -237,7 +236,7 @@ static int ping_v6_seq_show(struct seq_file *seq, void *v) seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { int bucket = ((struct ping_iter_state *) seq->private)->bucket; - struct inet_sock *inet = inet_sk(v); + struct inet_sock *inet = inet_sk((struct sock *)v); __u16 srcp = ntohs(inet->inet_sport); __u16 destp = ntohs(inet->inet_dport); ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index d6306aa46bb1..73296f38c252 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -27,7 +27,7 @@ #include <net/ipv6.h> #define MAX4(a, b, c, d) \ - max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) + MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) @@ -61,7 +61,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), - SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS), + SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), @@ -84,7 +84,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), - SNMP_MIB_SENTINEL + SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), }; static const struct snmp_mib snmp6_icmp6_list[] = { @@ -94,29 +94,10 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), - SNMP_MIB_SENTINEL +/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). */ + SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), }; -/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ -static const char *const icmp6type2name[256] = { - [ICMPV6_DEST_UNREACH] = "DestUnreachs", - [ICMPV6_PKT_TOOBIG] = "PktTooBigs", - [ICMPV6_TIME_EXCEED] = "TimeExcds", - [ICMPV6_PARAMPROB] = "ParmProblems", - [ICMPV6_ECHO_REQUEST] = "Echos", - [ICMPV6_ECHO_REPLY] = "EchoReplies", - [ICMPV6_MGM_QUERY] = "GroupMembQueries", - [ICMPV6_MGM_REPORT] = "GroupMembResponses", - [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", - [ICMPV6_MLD2_REPORT] = "MLDv2Reports", - [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", - [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", - [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", - [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", - [NDISC_REDIRECT] = "Redirects", -}; - - static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), @@ -127,7 +108,6 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { @@ -139,7 +119,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) @@ -149,11 +128,31 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { + const char *p = NULL; int icmptype; - const char *p; + +#define CASE(TYP, STR) case TYP: p = STR; break; icmptype = i & 0xff; - p = icmp6type2name[icmptype]; + switch (icmptype) { +/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ + CASE(ICMPV6_DEST_UNREACH, "DestUnreachs") + CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs") + CASE(ICMPV6_TIME_EXCEED, "TimeExcds") + CASE(ICMPV6_PARAMPROB, "ParmProblems") + CASE(ICMPV6_ECHO_REQUEST, "Echos") + CASE(ICMPV6_ECHO_REPLY, "EchoReplies") + CASE(ICMPV6_MGM_QUERY, "GroupMembQueries") + CASE(ICMPV6_MGM_REPORT, "GroupMembResponses") + CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions") + CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports") + CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements") + CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits") + CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements") + CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits") + CASE(NDISC_REDIRECT, "Redirects") + } +#undef CASE if (!p) /* don't print un-named types here */ continue; snprintf(name, sizeof(name), "Icmp6%s%s", @@ -180,35 +179,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, - const struct snmp_mib *itemlist) + const struct snmp_mib *itemlist, + int cnt) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { - memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + memset(buff, 0, sizeof(unsigned long) * cnt); - snmp_get_cpu_field_batch(buff, itemlist, pcpumib); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { - for (i = 0; itemlist[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, - const struct snmp_mib *itemlist, size_t syncpoff) + const struct snmp_mib *itemlist, + int cnt, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; - memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); + memset(buff64, 0, sizeof(u64) * cnt); - snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } @@ -217,14 +218,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, - NULL, snmp6_icmp6_list); + NULL, snmp6_icmp6_list, + ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, - NULL, snmp6_udp6_list); + NULL, snmp6_udp6_list, + ARRAY_SIZE(snmp6_udp6_list)); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, - NULL, snmp6_udplite6_list); + NULL, snmp6_udplite6_list, + ARRAY_SIZE(snmp6_udplite6_list)); return 0; } @@ -234,9 +240,14 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); + + /* Per idev icmp stats do not have ICMP6_MIB_RATELIMITHOST */ snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, - snmp6_icmp6_list); + snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1); + snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ada087b50541..b4cd05dba9b6 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -64,7 +64,7 @@ struct raw_hashinfo raw_v6_hashinfo; EXPORT_SYMBOL_GPL(raw_v6_hashinfo); -bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num, +bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, const struct in6_addr *rmt_addr, int dif, int sdif) { @@ -141,10 +141,9 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; const struct in6_addr *saddr; const struct in6_addr *daddr; + struct hlist_head *hlist; struct sock *sk; bool delivered = false; __u8 hash; @@ -152,15 +151,22 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { int filtered; if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, inet6_iif(skb), inet6_sdif(skb))) continue; + + if (atomic_read(&sk->sk_rmem_alloc) >= + READ_ONCE(sk->sk_rcvbuf)) { + sk_drops_inc(sk); + continue; + } + delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: @@ -194,10 +200,8 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ - if (clone) { - nf_reset_ct(clone); + if (clone) rawv6_rcv(sk, clone); - } } } rcu_read_unlock(); @@ -210,7 +214,8 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) } /* This cleans up af_inet6 a bit. -DaveM */ -static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int rawv6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -291,10 +296,9 @@ out: } static void rawv6_err(struct sock *sk, struct sk_buff *skb, - struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { - struct inet_sock *inet = inet_sk(sk); + bool recverr = inet6_test_bit(RECVERR6, sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; @@ -304,26 +308,26 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, 2. Socket is connected (otherwise the error indication is useless without recverr and error is hard. */ - if (!np->recverr && sk->sk_state != TCP_ESTABLISHED) + if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; harderr = icmpv6_err_convert(type, code, &err); if (type == ICMPV6_PKT_TOOBIG) { ip6_sk_update_pmtu(skb, sk, info); - harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); + harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO); } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); return; } - if (np->recverr) { + if (recverr) { u8 *payload = skb->data; - if (!inet->hdrincl) + if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } - if (np->recverr || harderr) { + if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } @@ -333,39 +337,40 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; + struct hlist_head *hlist; struct sock *sk; int hash; - hash = nexthdr & (RAW_HTABLE_SIZE - 1); + hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, inet6_iif(skb), inet6_iif(skb))) continue; - rawv6_err(sk, skb, NULL, type, code, inner_offset, info); + rawv6_err(sk, skb, type, code, inner_offset, info); } rcu_read_unlock(); } static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason; + if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); - kfree_skb(skb); + sk_drops_inc(sk); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); - if (sock_queue_rcv_skb(sk, skb) < 0) { - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { + sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } @@ -385,10 +390,11 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); - kfree_skb(skb); + sk_drops_inc(sk); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } + nf_reset_ct(skb); if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -407,10 +413,10 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) skb->len, inet->inet_num, 0)); - if (inet->hdrincl) { + if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); - kfree_skb(skb); + sk_drops_inc(sk); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } @@ -440,7 +446,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); @@ -589,12 +595,11 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags, const struct sockcm_cookie *sockc) { - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; - struct rt6_info *rt = (struct rt6_info *)*dstp; + struct rt6_info *rt = dst_rt6_info(*dstp); int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; @@ -615,9 +620,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); - skb->priority = sk->sk_priority; + skb->priority = sockc->priority; skb->mark = sockc->mark; - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_put(skb, length); skb_reset_network_header(skb); @@ -625,7 +630,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); @@ -653,7 +658,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, * have been queued for deletion. */ rcu_read_lock(); - IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) @@ -670,7 +675,7 @@ out: error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); error_check: - if (err == -ENOBUFS && !np->recverr) + if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) err = 0; return err; } @@ -763,24 +768,17 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; - /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields. - * Doing this indirectly yields the same result. - */ - hdrincl = inet->hdrincl; - hdrincl = READ_ONCE(hdrincl); + hdrincl = inet_test_bit(HDRINCL, sk); + + ipcm6_init_sk(&ipc6, sk); /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = sk->sk_mark; - fl6.flowi6_uid = sk->sk_uid; - - ipcm6_init(&ipc6); - ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + fl6.flowi6_mark = ipc6.sockc.mark; + fl6.flowi6_uid = sk_uid(sk); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -794,14 +792,15 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!proto) proto = inet->inet_num; - else if (proto != inet->inet_num) + else if (proto != inet->inet_num && + inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) return -EINVAL; daddr = &sin6->sin6_addr; - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); @@ -882,17 +881,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -903,9 +899,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = np->dontfrag; - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -917,7 +910,7 @@ back_from_confirm: ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, - len, 0, &ipc6, &fl6, (struct rt6_info *)dst, + len, 0, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) @@ -941,7 +934,7 @@ do_confirm: goto done; } -static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, +static int rawv6_seticmpfilter(struct sock *sk, int optname, sockptr_t optval, int optlen) { switch (optname) { @@ -958,7 +951,7 @@ static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, return 0; } -static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, +static int rawv6_geticmpfilter(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int len; @@ -1000,7 +993,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_HDRINCL: if (sk->sk_type != SOCK_RAW) return -EINVAL; - inet_sk(sk)->hdrincl = !!val; + inet_assign_bit(HDRINCL, sk, val); return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && @@ -1044,7 +1037,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; - return rawv6_seticmpfilter(sk, level, optname, optval, optlen); + return rawv6_seticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) @@ -1068,7 +1061,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case IPV6_HDRINCL: - val = inet_sk(sk)->hdrincl; + val = inet_test_bit(HDRINCL, sk); break; case IPV6_CHECKSUM: /* @@ -1105,7 +1098,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; - return rawv6_geticmpfilter(sk, level, optname, optval, optlen); + return rawv6_geticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) @@ -1118,29 +1111,29 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } -static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int rawv6_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); - - return put_user(amount, (int __user *)arg); + *karg = sk_wmem_alloc_get(sk); + return 0; } case SIOCINQ: { struct sk_buff *skb; - int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) - amount = skb->len; + *karg = skb->len; + else + *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + return 0; } default: #ifdef CONFIG_IPV6_MROUTE - return ip6mr_ioctl(sk, cmd, (void __user *)arg); + return ip6mr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif @@ -1183,6 +1176,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; @@ -1216,6 +1210,7 @@ struct proto rawv6_prot = { .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, @@ -1296,7 +1291,6 @@ const struct proto_ops inet6_sockraw_ops = { .sendmsg = inet_sendmsg, /* ok */ .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5bc8a28e67f9..25ec8001898d 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -68,11 +68,12 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip6_frag_expire(struct timer_list *t) { - struct inet_frag_queue *frag = from_timer(frag, t, timer); + struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); @@ -103,11 +104,11 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) return container_of(q, struct frag_queue, q); } -static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, +static int ip6_frag_queue(struct net *net, + struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, - u32 *prob_offset) + u32 *prob_offset, int *refs) { - struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; struct sk_buff *prev_tail; struct net_device *dev; @@ -198,7 +199,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); @@ -220,7 +221,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip6_frag_reasm(fq, skb, prev_tail, dev); + err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; return err; } @@ -238,7 +239,7 @@ insert_error: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: @@ -254,7 +255,8 @@ err: * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; @@ -262,7 +264,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -272,9 +274,9 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, if (!reasm_data) goto out_oom; - payload_len = ((skb->data - skb_network_header(skb)) - + payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - - sizeof(struct frag_hdr)); + sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) goto out_oversize; @@ -303,9 +305,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_postpush_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); - rcu_read_unlock(); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; @@ -317,19 +317,17 @@ out_oversize: out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); - rcu_read_unlock(); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -1; } static int ipv6_frag_rcv(struct sk_buff *skb) { + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net = skb_dst_dev_net(skb); struct frag_hdr *fhdr; struct frag_queue *fq; - const struct ipv6hdr *hdr = ipv6_hdr(skb); - struct net *net = dev_net(skb_dst(skb)->dev); u8 nexthdr; int iif; @@ -369,7 +367,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; - if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); @@ -377,19 +375,21 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } iif = skb->dev ? skb->dev->ifindex : 0; + rcu_read_lock(); fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; - int ret; + int ret, refs = 0; spin_lock(&fq->q.lock); fq->iif = iif; - ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, - &prob_offset); + ret = ip6_frag_queue(net, fq, skb, fhdr, IP6CB(skb)->nhoff, + &prob_offset, &refs); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + rcu_read_unlock(); + inet_frag_putn(&fq->q, refs); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); @@ -398,6 +398,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } return ret; } + rcu_read_unlock(); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); @@ -436,7 +437,6 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -449,7 +449,6 @@ static struct ctl_table ip6_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) @@ -470,7 +469,8 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) table[1].extra2 = &net->ipv6.fqdir->high_thresh; table[2].data = &net->ipv6.fqdir->timeout; - hdr = register_net_sysctl(net, "net/ipv6", table); + hdr = register_net_sysctl_sz(net, "net/ipv6", table, + ARRAY_SIZE(ip6_frags_ns_ctl_table)); if (!hdr) goto err_reg; @@ -486,7 +486,7 @@ err_alloc: static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e74e0361fd92..aee6a10b112a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -87,11 +87,12 @@ struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); -static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, - struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); + struct net_device *dev); +static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -130,7 +131,6 @@ static struct fib6_info *rt6_get_route_info(struct net *net, struct uncached_list { spinlock_t lock; struct list_head head; - struct list_head quarantine; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); @@ -139,20 +139,20 @@ void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - rt->rt6i_uncached_list = ul; + rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); - list_add_tail(&rt->rt6i_uncached, &ul->head); + list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt6_uncached_list_del(struct rt6_info *rt) { - if (!list_empty(&rt->rt6i_uncached)) { - struct uncached_list *ul = rt->rt6i_uncached_list; + if (!list_empty(&rt->dst.rt_uncached)) { + struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); - list_del_init(&rt->rt6i_uncached); + list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } @@ -169,12 +169,12 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) continue; spin_lock_bh(&ul->lock); - list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) { + list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; bool handled = false; - if (rt_idev->dev == dev) { + if (rt_idev && rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); handled = true; @@ -188,8 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) handled = true; } if (handled) - list_move(&rt->rt6i_uncached, - &ul->quarantine); + list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } @@ -226,16 +225,16 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), - dst->dev, skb, daddr); + dst_dev(dst), skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { - struct net_device *dev = dst->dev; - struct rt6_info *rt = (struct rt6_info *)dst; + const struct rt6_info *rt = dst_rt6_info(dst); + struct net_device *dev = dst_dev(dst); daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) @@ -293,7 +292,7 @@ static const struct fib6_info fib6_null_entry_template = { static const struct rt6_info ip6_null_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, @@ -307,7 +306,7 @@ static const struct rt6_info ip6_null_entry_template = { static const struct rt6_info ip6_prohibit_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, @@ -319,7 +318,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { - .__refcnt = ATOMIC_INIT(1), + .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, @@ -334,7 +333,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = { static void rt6_info_init(struct rt6_info *rt) { memset_after(rt, 0, dst); - INIT_LIST_HEAD(&rt->rt6i_uncached); } /* allocate dst with ip6_dst_ops */ @@ -342,7 +340,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 1, DST_OBSOLETE_FORCE_CHK, flags); + DST_OBSOLETE_FORCE_CHK, flags); if (rt) { rt6_info_init(rt); @@ -355,7 +353,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; @@ -368,15 +366,15 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - from = xchg((__force struct fib6_info **)&rt->from, NULL); + from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } -static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) +static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; + struct fib6_info *from; if (idev && idev->dev != blackhole_netdev) { struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); @@ -386,14 +384,15 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, in6_dev_put(idev); } } + from = unrcu_pointer(xchg(&rt->from, NULL)); + fib6_info_release(from); } static bool __rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) - return time_after(jiffies, rt->dst.expires); - else - return false; + return time_after(jiffies, READ_ONCE(rt->dst.expires)); + return false; } static bool rt6_check_expired(const struct rt6_info *rt) @@ -403,21 +402,46 @@ static bool rt6_check_expired(const struct rt6_info *rt) from = rcu_dereference(rt->from); if (rt->rt6i_flags & RTF_EXPIRES) { - if (time_after(jiffies, rt->dst.expires)) + if (time_after(jiffies, READ_ONCE(rt->dst.expires))) return true; } else if (from) { - return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || + return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK || fib6_check_expired(from); } return false; } +static struct fib6_info * +rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) +{ + struct fib6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + iter = rcu_dereference(fn->leaf); + if (!iter) + goto out; + + while (iter) { + if (iter->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference(iter->fib6_next); + } + +out: + return NULL; +} + void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *sibling, *next_sibling; - struct fib6_info *match = res->f6i; + struct fib6_info *first, *match = res->f6i; + struct fib6_info *sibling; + int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -425,6 +449,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (match->nh && have_oif_match && res->nh) return; + if (skb) + IP6CB(skb)->flags |= IP6SKB_MULTIPATH; + /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ @@ -437,16 +464,25 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, return; } - if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) + first = rt6_multipath_first_sibling_rcu(match); + if (!first) + goto out; + + hash = fl6->mp_hash; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { + if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) + match = first; goto out; + } - list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, - fib6_siblings) { + list_for_each_entry_rcu(sibling, &first->fib6_siblings, + fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); - if (fl6->mp_hash > nh_upper_bound) + if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; @@ -633,25 +669,28 @@ static void rt6_probe(struct fib6_nh *fib6_nh) nh_gw = &fib6_nh->fib_nh_gw6; dev = fib6_nh->fib_nh_dev; - rcu_read_lock_bh(); + rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); + if (!idev) + goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { - if (neigh->nud_state & NUD_VALID) + if (READ_ONCE(neigh->nud_state) & NUD_VALID) goto out; - write_lock(&neigh->lock); + write_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, - neigh->updated + idev->cnf.rtr_probe_interval)) { + neigh->updated + + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); } - write_unlock(&neigh->lock); + write_unlock_bh(&neigh->lock); } else if (time_after(jiffies, last_probe + - idev->cnf.rtr_probe_interval)) { + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } @@ -667,7 +706,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh) } out: - rcu_read_unlock_bh(); + rcu_read_unlock(); } #else static inline void rt6_probe(struct fib6_nh *fib6_nh) @@ -683,25 +722,25 @@ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; - rcu_read_lock_bh(); + rcu_read_lock(); neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, &fib6_nh->fib_nh_gw6); if (neigh) { - read_lock(&neigh->lock); - if (neigh->nud_state & NUD_VALID) + u8 nud_state = READ_ONCE(neigh->nud_state); + + if (nud_state & NUD_VALID) ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF - else if (!(neigh->nud_state & NUD_FAILED)) + else if (!(nud_state & NUD_FAILED)) ret = RT6_NUD_SUCCEED; else ret = RT6_NUD_FAIL_PROBE; #endif - read_unlock(&neigh->lock); } else { ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; } - rcu_read_unlock_bh(); + rcu_read_unlock(); return ret; } @@ -930,6 +969,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; + struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; @@ -988,10 +1028,18 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (!addrconf_finite_timeout(lifetime)) + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); - else + fib6_remove_gc_list(rt); + } else { fib6_set_expires(rt, jiffies + HZ * lifetime); + fib6_add_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } @@ -1096,6 +1144,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) rt->dst.input = ip6_input; } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; + rt->dst.output = ip6_mr_output; } else { rt->dst.input = ip6_forward; } @@ -1277,7 +1326,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) - return (struct rt6_info *) dst; + return dst_rt6_info(dst); dst_release(dst); @@ -1397,6 +1446,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); + /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); @@ -1425,7 +1475,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, if (res->f6i->fib6_destroying) { struct fib6_info *from; - from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } @@ -1442,7 +1492,6 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { - struct fib6_info *from; struct net *net; if (!bucket || !rt6_ex) @@ -1454,8 +1503,6 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ - from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); - fib6_info_release(from); dst_dev_put(&rt6_ex->rt6i->dst); hlist_del_rcu(&rt6_ex->hlist); @@ -1586,7 +1633,7 @@ static unsigned int fib6_mtu(const struct fib6_result *res) rcu_read_lock(); idev = __in6_dev_get(dev); - mtu = idev->cnf.mtu6; + mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } @@ -1724,6 +1771,7 @@ out: if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); + fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1772,11 +1820,13 @@ static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) void rt6_flush_exceptions(struct fib6_info *f6i) { - if (f6i->nh) - nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, - f6i); - else + if (f6i->nh) { + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); + rcu_read_unlock(); + } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); + } } /* Find cached rt in the hash table inside passed in rt @@ -2083,13 +2133,14 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, * expired, independently from their aging, as per RFC 8201 section 4 */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { - RT6_TRACE("aging clone %p\n", rt); + if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) + + gc_args->timeout)) { + pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } - } else if (time_after(jiffies, rt->dst.expires)) { - RT6_TRACE("purging expired route %p\n", rt); + } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) { + pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } @@ -2100,8 +2151,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (!(neigh && (neigh->flags & NTF_ROUTER))) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); + pr_debug("purging route %p via non-router but gateway\n", + rt); rt6_remove_exception(bucket, rt6_ex); return; } @@ -2210,7 +2261,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; - if (net->ipv6.devconf_all->forwarding == 0) + if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) strict |= RT6_LOOKUP_F_REACHABLE; rcu_read_lock(); @@ -2361,7 +2412,7 @@ static u32 rt6_multipath_custom_hash_outer(const struct net *net, hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_inner(const struct net *net, @@ -2410,7 +2461,7 @@ static u32 rt6_multipath_custom_hash_inner(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_skb(const struct net *net, @@ -2444,12 +2495,16 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl6 can be NULL */ @@ -2471,7 +2526,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: if (skb) { @@ -2499,11 +2554,14 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2540,7 +2598,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) @@ -2593,9 +2651,10 @@ INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags_noref(struct net *net, - const struct sock *sk, - struct flowi6 *fl6, int flags) +static struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) { bool any_src; @@ -2620,11 +2679,10 @@ struct dst_entry *ip6_route_output_flags_noref(struct net *net, if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) - flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); + flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs)); return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } -EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, @@ -2636,9 +2694,9 @@ struct dst_entry *ip6_route_output_flags(struct net *net, rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); - rt6 = (struct rt6_info *)dst; + rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ - if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; dst_hold(dst); } @@ -2650,11 +2708,11 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; + struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; - rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, + rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); @@ -2719,11 +2777,10 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && - rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && fib6_check(from, cookie)) return &rt->dst; - else - return NULL; + return NULL; } INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, @@ -2733,7 +2790,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, struct fib6_info *from; struct rt6_info *rt; - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; @@ -2748,7 +2805,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, from = rcu_dereference(rt->from); if (from && (rt->rt6i_flags & RTF_PCPU || - unlikely(!list_empty(&rt->rt6i_uncached)))) + unlikely(!list_empty(&rt->dst.rt_uncached)))) dst_ret = rt6_dst_from_check(rt, from, cookie); else dst_ret = rt6_check(rt, from, cookie); @@ -2759,24 +2816,24 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); -static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *) dst; + struct rt6_info *rt = dst_rt6_info(dst); - if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - rcu_read_lock(); - if (rt6_check_expired(rt)) { - rt6_remove_exception_rt(rt); - dst = NULL; - } - rcu_read_unlock(); - } else { - dst_release(dst); - dst = NULL; + if (rt->rt6i_flags & RTF_CACHE) { + rcu_read_lock(); + if (rt6_check_expired(rt)) { + /* rt/dst can not be destroyed yet, + * because of rcu_read_lock() + */ + sk_dst_reset(sk); + rt6_remove_exception_rt(rt); } + rcu_read_unlock(); + return; } - return dst; + sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) @@ -2785,7 +2842,7 @@ static void ip6_link_failure(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { @@ -2813,7 +2870,7 @@ static void rt6_update_expires(struct rt6_info *rt0, int timeout) rcu_read_lock(); from = rcu_dereference(rt0->from); if (from) - rt0->dst.expires = from->expires; + WRITE_ONCE(rt0->dst.expires, from->expires); rcu_read_unlock(); } @@ -2841,7 +2898,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. @@ -2886,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst->dev, + .dev = dst_dev_rcu(dst), .gw = &rt6->rt6i_gateway, }; @@ -2952,10 +3009,11 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) if (!oif && skb->dev) oif = l3mdev_master_ifindex(skb->dev); - ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); + ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), + sk_uid(sk)); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) return; @@ -2974,13 +3032,12 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? - &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, @@ -3173,22 +3230,26 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { - ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, - sk->sk_uid); + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, + READ_ONCE(sk->sk_mark), sk_uid(sk)); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - struct net_device *dev = dst->dev; unsigned int mtu = dst_mtu(dst); - struct net *net = dev_net(dev); + struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + rcu_read_lock(); + + net = dst_dev_net_rcu(dst); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + rcu_read_unlock(); + /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. @@ -3238,8 +3299,8 @@ u32 ip6_mtu_from_fib6(const struct fib6_result *res, mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); - if (idev && idev->cnf.mtu6 > mtu) - mtu = idev->cnf.mtu6; + if (idev) + mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); @@ -3284,23 +3345,17 @@ out: return dst; } -static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; unsigned int val; int entries; - entries = dst_entries_get_fast(ops); - if (entries > rt_max_size) - entries = dst_entries_get_slow(ops); - - if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); @@ -3310,7 +3365,6 @@ static int ip6_dst_gc(struct dst_ops *ops) out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); - return entries > rt_max_size; } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, @@ -3368,6 +3422,7 @@ static int ip6_route_check_nh_onlink(struct net *net, static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, + netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; @@ -3412,7 +3467,7 @@ static int ip6_route_check_nh(struct net *net, err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; - dev_hold(dev); + netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } @@ -3420,7 +3475,9 @@ static int ip6_route_check_nh(struct net *net, } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, - struct net_device **_dev, struct inet6_dev **idev, + struct net_device **_dev, + netdevice_tracker *dev_tracker, + struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; @@ -3461,7 +3518,8 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else - err = ip6_route_check_nh(net, cfg, _dev, idev); + err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, + idev); rcu_read_unlock(); @@ -3511,6 +3569,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { + netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; int addr_type; @@ -3528,7 +3587,8 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, err = -ENODEV; if (cfg->fc_ifindex) { - dev = dev_get_by_index(net, cfg->fc_ifindex); + dev = netdev_get_by_index(net, cfg->fc_ifindex, + dev_tracker, gfp_flags); if (!dev) goto out; idev = in6_dev_get(dev); @@ -3562,11 +3622,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { - dev_put(dev); + netdev_put(dev, dev_tracker); in6_dev_put(idev); } dev = net->loopback_dev; - dev_hold(dev); + netdev_hold(dev, dev_tracker, gfp_flags); idev = in6_dev_get(dev); if (!idev) { err = -ENODEV; @@ -3577,7 +3637,8 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, } if (cfg->fc_flags & RTF_GATEWAY) { - err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + err = ip6_validate_gw(net, cfg, &dev, dev_tracker, + &idev, extack); if (err) goto out; @@ -3589,7 +3650,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, if (!dev) goto out; - if (idev->cnf.disable_ipv6) { + if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; @@ -3618,8 +3679,6 @@ pcpu_alloc: } fib6_nh->fib_nh_dev = dev; - netdev_tracker_alloc(dev, &fib6_nh->fib_nh_dev_tracker, gfp_flags); - fib6_nh->fib_nh_oif = dev->ifindex; err = 0; out: @@ -3627,9 +3686,10 @@ out: in6_dev_put(idev); if (err) { - lwtstate_put(fib6_nh->fib_nh_lws); + fib_nh_common_release(&fib6_nh->nh_common); + fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; - dev_put(dev); + netdev_put(dev, dev_tracker); } return err; @@ -3675,62 +3735,62 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) } } -static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, - gfp_t gfp_flags, - struct netlink_ext_ack *extack) +static int fib6_config_validate(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { - struct net *net = cfg->fc_nlinfo.nl_net; - struct fib6_info *rt = NULL; - struct nexthop *nh = NULL; - struct fib6_table *table; - struct fib6_nh *fib6_nh; - int err = -EINVAL; - int addr_type; - /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); - goto out; + goto errout; } /* RTF_CACHE is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_CACHE) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); - goto out; + goto errout; } if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); - goto out; + goto errout; } if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); - goto out; + goto errout; } + +#ifdef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len > 128) { NL_SET_ERR_MSG(extack, "Invalid source address length"); - goto out; + goto errout; } -#ifndef CONFIG_IPV6_SUBTREES + + if (cfg->fc_nh_id && cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto errout; + } +#else if (cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Specifying source address requires IPV6_SUBTREES to be enabled"); - goto out; + goto errout; } #endif - if (cfg->fc_nh_id) { - nh = nexthop_find_by_id(net, cfg->fc_nh_id); - if (!nh) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - goto out; - } - err = fib6_check_nexthop(nh, cfg, extack); - if (err) - goto out; - } + return 0; +errout: + return -EINVAL; +} + +static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, + gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct fib6_table *table; + struct fib6_info *rt; + int err; - err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); @@ -3741,22 +3801,22 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, } else { table = fib6_new_table(net, cfg->fc_table); } + if (!table) { + err = -ENOBUFS; + goto err; + } - if (!table) - goto out; - - err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags, !nh); - if (!rt) - goto out; + rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); + if (!rt) { + err = -ENOMEM; + goto err; + } - rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, + rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); - /* Do not leave garbage there. */ - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - goto out_free; + goto free; } if (cfg->fc_flags & RTF_ADDRCONF) @@ -3764,14 +3824,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + - clock_t_to_jiffies(cfg->fc_expires)); - else - fib6_clean_expires(rt); + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->fib6_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; @@ -3784,21 +3842,54 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - if (nh) { - if (rt->fib6_src.plen) { - NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + return rt; +free: + kfree(rt); +err: + return ERR_PTR(err); +} + +static int ip6_route_info_create_nh(struct fib6_info *rt, + struct fib6_config *cfg, + gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct fib6_nh *fib6_nh; + int err; + + if (cfg->fc_nh_id) { + struct nexthop *nh; + + rcu_read_lock(); + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out_free; } + + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out_free; + if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -ENOENT; goto out_free; } + rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); + + rcu_read_unlock(); } else { + int addr_type; + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) - goto out; + goto out_release; fib6_nh = rt->fib6_nh; @@ -3817,21 +3908,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; - goto out; + goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; - } else - rt->fib6_prefsrc.plen = 0; + } - return rt; -out: + return 0; +out_release: fib6_info_release(rt); - return ERR_PTR(err); + return err; out_free: + rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); - return ERR_PTR(err); + return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, @@ -3840,10 +3931,18 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct fib6_info *rt; int err; + err = fib6_config_validate(cfg, extack); + if (err) + return err; + rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); + err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); + if (err) + return err; + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); @@ -4072,9 +4171,9 @@ static int ip6_route_del(struct fib6_config *cfg, if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); - return __ip6_del_rt(rt, &cfg->fc_nlinfo); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + break; } if (cfg->fc_nh_id) continue; @@ -4089,13 +4188,13 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) - return __ip6_del_rt(rt, &cfg->fc_nlinfo); - - return __ip6_del_rt_siblings(rt, cfg); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + else + err = __ip6_del_rt_siblings(rt, cfg); + break; } } rcu_read_unlock(); @@ -4142,7 +4241,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) return; - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) + if (READ_ONCE(in6_dev->cnf.forwarding) || + !READ_ONCE(in6_dev->cnf.accept_redirects)) return; /* RFC2461 8.1: @@ -4165,7 +4265,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu } } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; @@ -4199,7 +4299,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst->dev, + .dev = dst_dev_rcu(dst), .gw = &rt->rt6i_gateway, }; @@ -4355,7 +4455,8 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, - u32 defrtr_usr_metric) + u32 defrtr_usr_metric, + int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, @@ -4368,6 +4469,7 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, + .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; @@ -4434,7 +4536,7 @@ static void rtmsg_to_fib6_config(struct net *net, .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, - .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, + .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, @@ -4461,16 +4563,18 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) rtmsg_to_fib6_config(net, rtmsg, &cfg); - rtnl_lock(); switch (cmd) { case SIOCADDRT: + /* Only do the default setting of fc_metric in route adding */ + if (cfg.fc_metric == 0) + cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); break; } - rtnl_unlock(); + return err; } @@ -4481,13 +4585,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct inet6_dev *idev; SKB_DR(reason); int type; if (netif_is_l3_master(skb->dev) || - dst->dev == net->loopback_dev) + dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else idev = ip6_dst_idev(dst); @@ -4524,7 +4629,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } @@ -4535,7 +4640,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } @@ -4546,7 +4651,8 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, - bool anycast, gfp_t gfp_flags) + bool anycast, gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, @@ -4559,6 +4665,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; + int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; @@ -4568,36 +4675,39 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, cfg.fc_flags |= RTF_LOCAL; } - f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); - if (!IS_ERR(f6i)) { - f6i->dst_nocount = true; + f6i = ip6_route_info_create(&cfg, gfp_flags, extack); + if (IS_ERR(f6i)) + return f6i; - if (!anycast && - (net->ipv6.devconf_all->disable_policy || - idev->cnf.disable_policy)) - f6i->dst_nopolicy = true; - } + err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); + if (err) + return ERR_PTR(err); + + f6i->dst_nocount = true; + + if (!anycast && + (READ_ONCE(net->ipv6.devconf_all->disable_policy) || + READ_ONCE(idev->cnf.disable_policy))) + f6i->dst_nopolicy = true; return f6i; } /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { - struct net_device *dev; struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { - struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && - ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && - ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { + ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && + !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; @@ -4610,7 +4720,6 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { - .dev = ifp->idev->dev, .net = net, .addr = &ifp->addr, }; @@ -4991,14 +5100,63 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, + [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; +static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, + struct netlink_ext_ack *extack, + bool newroute) +{ + struct rtnexthop *rtnh; + int remaining; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + + do { + bool has_gateway = cfg->fc_flags & RTF_GATEWAY; + int attrlen = rtnh_attrlen(rtnh); + + if (attrlen > 0) { + struct nlattr *nla, *attrs; + + attrs = rtnh_attrs(rtnh); + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + if (nla_len(nla) < sizeof(cfg->fc_gateway)) { + NL_SET_ERR_MSG(extack, + "Invalid IPv6 address in RTA_GATEWAY"); + return -EINVAL; + } + + has_gateway = true; + } + } + + if (newroute && (cfg->fc_nh_id || !has_gateway)) { + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + return -EINVAL; + } + + rtnh = rtnh_next(rtnh, &remaining); + } while (rtnh_ok(rtnh, remaining)); + + return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); +} + static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rtmsg *rtm; + bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; + struct rtmsg *rtm; unsigned int pref; int err; @@ -5016,6 +5174,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + if (tb[RTA_FLOWLABEL]) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Flow label cannot be specified for this operation"); + goto errout; + } + *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, @@ -5101,8 +5265,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); - err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, - cfg->fc_mp_len, extack); + err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } @@ -5143,29 +5306,28 @@ errout: struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; - struct list_head next; + struct list_head list; }; -static int ip6_route_info_append(struct net *net, - struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; - int err = -EEXIST; - list_for_each_entry(nh, rt6_nh_list, next) { + list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) - return err; + return -EEXIST; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; + nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); - list_add_tail(&nh->next, rt6_nh_list); + list_add_tail(&nh->list, rt6_nh_list); return 0; } @@ -5181,14 +5343,19 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, * nexthop. Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ - if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { - rt = list_first_entry(&rt_last->fib6_siblings, - struct fib6_info, - fib6_siblings); + rcu_read_lock(); + + if ((nlflags & NLM_F_APPEND) && rt_last && + READ_ONCE(rt_last->fib6_nsiblings)) { + rt = list_first_or_null_rcu(&rt_last->fib6_siblings, + struct fib6_info, + fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); + + rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) @@ -5217,37 +5384,30 @@ out: return should_notify; } -static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, - struct netlink_ext_ack *extack) -{ - if (nla_len(nla) < sizeof(*gw)) { - NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); - return -EINVAL; - } - - *gw = nla_get_in6_addr(nla); - - return 0; -} - static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + struct rt6_nh *nh, *nh_safe; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct fib6_info *rt; + LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; - struct rt6_nh *nh, *nh_safe; + struct fib6_info *rt; __u16 nlflags; int remaining; int attrlen; - int err = 1; + int replace; int nhn = 0; - int replace = (cfg->fc_nlinfo.nlh && - (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); - LIST_HEAD(rt6_nh_list); + int err; + + err = fib6_config_validate(cfg, extack); + if (err) + return err; + + replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) @@ -5270,18 +5430,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) - goto cleanup; - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } - r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); - /* RTA_ENCAP_TYPE length checked in - * lwtunnel_valid_encap_type_attr - */ + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); @@ -5294,18 +5447,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt = NULL; goto cleanup; } - if (!rt6_qualify_for_ecmp(rt)) { - err = -EINVAL; - NL_SET_ERR_MSG(extack, - "Device only routes can not be added for IPv6 using the multipath API."); - fib6_info_release(rt); + + err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); + if (err) { + rt = NULL; goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(info->nl_net, &rt6_nh_list, - rt, &r_cfg); + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; @@ -5314,12 +5465,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } - if (list_empty(&rt6_nh_list)) { - NL_SET_ERR_MSG(extack, - "Invalid nexthop configuration - no valid nexthops"); - return -EINVAL; - } - /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done @@ -5332,21 +5477,9 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, info->skip_notify_kernel = 1; err_nh = NULL; - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); - fib6_info_release(nh->fib6_info); - if (!err) { - /* save reference to last route successfully inserted */ - rt_last = nh->fib6_info; - - /* save reference to first route for notification */ - if (!rt_notif) - rt_notif = nh->fib6_info; - } - - /* nh->fib6_info is used or freed at this point, reset to NULL*/ - nh->fib6_info = NULL; if (err) { if (replace && nhn) NL_SET_ERR_MSG_MOD(extack, @@ -5354,6 +5487,12 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = nh; goto add_errout; } + /* save reference to last route successfully inserted */ + rt_last = nh->fib6_info; + + /* save reference to first route for notification */ + if (!rt_notif) + rt_notif = nh->fib6_info; /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, @@ -5406,17 +5545,16 @@ add_errout: ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: - list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { - if (nh->fib6_info) - fib6_info_release(nh->fib6_info); - list_del(&nh->next); + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { + fib6_info_release(nh->fib6_info); + list_del(&nh->list); kfree(nh); } @@ -5448,21 +5586,15 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) { - last_err = err; - goto next_rtnh; - } - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } + err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; -next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } @@ -5479,15 +5611,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (cfg.fc_nh_id && - !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - return -EINVAL; + if (cfg.fc_nh_id) { + rcu_read_lock(); + err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); + rcu_read_unlock(); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } } - if (cfg.fc_mp) + if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); - else { + } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } @@ -5533,27 +5670,34 @@ static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) static size_t rt6_nlmsg_size(struct fib6_info *f6i) { + struct fib6_info *sibling; + struct fib6_nh *nh; int nexthop_len; if (f6i->nh) { nexthop_len = nla_total_size(4); /* RTA_NH_ID */ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); - } else { - struct fib6_nh *nh = f6i->fib6_nh; - - nexthop_len = 0; - if (f6i->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(nh->fib_nh_lws); + goto common; + } - nexthop_len *= f6i->fib6_nsiblings; + rcu_read_lock(); +retry: + nh = f6i->fib6_nh; + nexthop_len = 0; + if (READ_ONCE(f6i->fib6_nsiblings)) { + rt6_nh_nlmsg_size(nh, &nexthop_len); + + list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, + fib6_siblings) { + rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); + if (!READ_ONCE(f6i->fib6_nsiblings)) + goto retry; } - nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } - + rcu_read_unlock(); + nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); +common: return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ + nla_total_size(16) /* RTA_DST */ @@ -5605,7 +5749,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; @@ -5679,7 +5823,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } else if (dest) { struct in6_addr saddr_buf; - if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && + if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -5702,18 +5846,20 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, * each as a nexthop within RTA_MULTIPATH. */ if (rt6) { + struct net_device *dev; + if (rt6_flags & RTF_GATEWAY && nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) goto nla_put_failure; - if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) + dev = dst_dev(dst); + if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; - if (dst->lwtstate && - lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; - } else if (rt->fib6_nsiblings) { - struct fib6_info *sibling, *next_sibling; + } else if (READ_ONCE(rt->fib6_nsiblings)) { + struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); @@ -5725,14 +5871,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, 0) < 0) goto nla_put_failure; - list_for_each_entry_safe(sibling, next_sibling, - &rt->fib6_siblings, fib6_siblings) { + rcu_read_lock(); + + list_for_each_entry_rcu(sibling, &rt->fib6_siblings, + fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6, 0) < 0) + AF_INET6, 0) < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } } + rcu_read_unlock(); + nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) @@ -5755,7 +5908,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } if (rt6_flags & RTF_EXPIRES) { - expires = dst ? dst->expires : rt->expires; + expires = dst ? READ_ONCE(dst->expires) : rt->expires; expires -= jiffies; } @@ -5807,16 +5960,21 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, if (f6i->fib6_nh->fib_nh_dev == dev) return true; - if (f6i->fib6_nsiblings) { - struct fib6_info *sibling, *next_sibling; + if (READ_ONCE(f6i->fib6_nsiblings)) { + const struct fib6_info *sibling; - list_for_each_entry_safe(sibling, next_sibling, - &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh->fib_nh_dev == dev) + rcu_read_lock(); + list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, + fib6_siblings) { + if (sibling->fib6_nh->fib_nh_dev == dev) { + rcu_read_unlock(); return true; + } + if (!READ_ONCE(f6i->fib6_nsiblings)) + break; } + rcu_read_unlock(); } - return false; } @@ -5955,7 +6113,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -5965,7 +6124,6 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || @@ -5990,6 +6148,13 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } + if (tb[RTA_FLOWLABEL] && + (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Invalid flow label"); + return -EINVAL; + } + for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; @@ -6004,6 +6169,7 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: + case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); @@ -6026,6 +6192,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; + __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); @@ -6034,7 +6201,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = -EINVAL; rtm = nlmsg_data(nlh); - fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { @@ -6080,6 +6246,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); + fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); + if (iif) { struct net_device *dev; int flags = 0; @@ -6108,7 +6277,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); @@ -6161,32 +6330,42 @@ errout: void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { - struct sk_buff *skb; struct net *net = info->nl_net; + struct sk_buff *skb; + size_t sz; u32 seq; int err; err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + rcu_read_lock(); + sz = rt6_nlmsg_size(rt); +retry: + skb = nlmsg_new(sz, GFP_ATOMIC); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, event, info->portid, seq, nlm_flags); if (err < 0) { - /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); kfree_skb(skb); + /* -EMSGSIZE implies needed space grew under us. */ + if (err == -EMSGSIZE) { + sz = max(rt6_nlmsg_size(rt), sz << 1); + goto retry; + } goto errout; } + + rcu_read_unlock(); + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + info->nlh, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); + rcu_read_unlock(); + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_rt_update(struct net *net, struct fib6_info *rt, @@ -6212,8 +6391,7 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt, info->nlh, gfp_any()); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, @@ -6326,7 +6504,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, +static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -6335,12 +6513,12 @@ static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, if (!write) return -EINVAL; - net = (struct net *)ctl->extra1; - delay = net->ipv6.sysctl.flush_delay; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } @@ -6419,13 +6597,12 @@ static struct ctl_table ipv6_route_table_template[] = { { .procname = "skip_notify_on_dev_down", .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) @@ -6449,14 +6626,19 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[1].procname = NULL; } return table; } + +size_t ipv6_route_sysctl_table_size(struct net *net) +{ + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + return 1; + + return ARRAY_SIZE(ipv6_route_table_template); +} #endif static int __net_init ip6_route_net_init(struct net *net) @@ -6483,7 +6665,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); - INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; @@ -6495,7 +6677,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); - INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), @@ -6505,14 +6687,14 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); - INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_SUBTREES net->ipv6.fib6_routes_require_src = 0; #endif #endif net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; @@ -6637,8 +6819,7 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) -BTF_ID_LIST(btf_fib6_info_id) -BTF_ID(struct, fib6_info) +BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info) static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, @@ -6670,6 +6851,15 @@ static void bpf_iter_unregister(void) #endif #endif +static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, + .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, + .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, + .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, +}; + int __init ip6_route_init(void) { int ret; @@ -6712,19 +6902,7 @@ int __init ip6_route_init(void) if (ret) goto fib6_rules_init; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, - inet6_rtm_newroute, NULL, 0); - if (ret < 0) - goto out_register_late_subsys; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, - inet6_rtm_delroute, NULL, 0); - if (ret < 0) - goto out_register_late_subsys; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, - inet6_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED); + ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; @@ -6744,7 +6922,6 @@ int __init ip6_route_init(void) struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); - INIT_LIST_HEAD(&ul->quarantine); spin_lock_init(&ul->lock); } diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index 488aec9e1a74..e186998bfbf7 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -29,12 +29,6 @@ static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i) return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)]; } -size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, - unsigned char cmpre) -{ - return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); -} - void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, const struct ipv6_rpl_sr_hdr *inhdr, const struct in6_addr *daddr, unsigned char n) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index ff691d9f4a04..c7942cf65567 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -13,7 +13,7 @@ #include <net/rpl.h> struct rpl_iptunnel_encap { - struct ipv6_rpl_sr_hdr srh[0]; + DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { @@ -125,16 +125,17 @@ static void rpl_destroy_state(struct lwtunnel_state *lwt) } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, - const struct ipv6_rpl_sr_hdr *srh) + const struct ipv6_rpl_sr_hdr *srh, + struct dst_entry *cache_dst) { struct ipv6_rpl_sr_hdr *isrh, *csrh; - const struct ipv6hdr *oldhdr; + struct ipv6hdr oldhdr; struct ipv6hdr *hdr; unsigned char *buf; size_t hdrlen; int err; - oldhdr = ipv6_hdr(skb); + memcpy(&oldhdr, ipv6_hdr(skb), sizeof(oldhdr)); buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); if (!buf) @@ -146,14 +147,14 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, memcpy(isrh, srh, sizeof(*isrh)); memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1], (srh->segments_left - 1) * 16); - isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr->daddr; + isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr.daddr; ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0], isrh->segments_left - 1); hdrlen = ((csrh->hdrlen + 1) << 3); - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) { kfree(buf); return err; @@ -168,7 +169,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); - memmove(hdr, oldhdr, sizeof(*hdr)); + memmove(hdr, &oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, csrh, hdrlen); @@ -186,7 +187,8 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, return 0; } -static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) +static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; @@ -196,7 +198,7 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) tinfo = rpl_encap_lwtunnel(dst->lwtstate); - return rpl_do_srh_inline(skb, rlwt, tinfo->srh); + return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -208,14 +210,14 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); - err = rpl_do_srh(skb, rlwt); + local_bh_disable(); + dst = dst_cache_get(&rlwt->cache); + local_bh_enable(); + + err = rpl_do_srh(skb, rlwt, dst); if (unlikely(err)) goto drop; - preempt_disable(); - dst = dst_cache_get(&rlwt->cache); - preempt_enable(); - if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -230,25 +232,28 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - preempt_disable(); - dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - preempt_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } @@ -257,41 +262,54 @@ static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; - rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + rlwt = rpl_lwt_lwtunnel(lwtst); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&rlwt->cache); - preempt_enable(); + local_bh_enable(); - skb_dst_drop(skb); + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) { + dst_release(dst); + goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { - preempt_disable(); + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); + if (unlikely(err)) + goto drop; } else { + skb_dst_drop(skb); skb_dst_set(skb, dst); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - return err; - return dst_input(skb); + +drop: + kfree_skb(skb); + return err; } static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype, diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 29346a6eec9f..a5c4c629b788 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -21,9 +21,7 @@ #include <net/genetlink.h> #include <linux/seg6.h> #include <linux/seg6_genl.h> -#ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> -#endif bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) { @@ -437,13 +435,11 @@ static int __net_init seg6_net_init(struct net *net) net->ipv6.seg6_data = sdata; -#ifdef CONFIG_IPV6_SEG6_HMAC if (seg6_hmac_net_init(net)) { kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; } -#endif return 0; } @@ -452,9 +448,7 @@ static void __net_exit seg6_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); -#ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_net_exit(net); -#endif kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); @@ -512,58 +506,39 @@ int __init seg6_init(void) { int err; - err = genl_register_family(&seg6_genl_family); + err = register_pernet_subsys(&ip6_segments_ops); if (err) goto out; - err = register_pernet_subsys(&ip6_segments_ops); + err = genl_register_family(&seg6_genl_family); if (err) - goto out_unregister_genl; + goto out_unregister_pernet; -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL err = seg6_iptunnel_init(); if (err) - goto out_unregister_pernet; + goto out_unregister_genl; err = seg6_local_init(); if (err) - goto out_unregister_pernet; -#endif - -#ifdef CONFIG_IPV6_SEG6_HMAC - err = seg6_hmac_init(); - if (err) goto out_unregister_iptun; -#endif pr_info("Segment Routing with IPv6\n"); out: return err; -#ifdef CONFIG_IPV6_SEG6_HMAC out_unregister_iptun: -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL - seg6_local_exit(); seg6_iptunnel_exit(); -#endif -#endif -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL -out_unregister_pernet: - unregister_pernet_subsys(&ip6_segments_ops); -#endif out_unregister_genl: genl_unregister_family(&seg6_genl_family); +out_unregister_pernet: + unregister_pernet_subsys(&ip6_segments_ops); goto out; } void seg6_exit(void) { -#ifdef CONFIG_IPV6_SEG6_HMAC - seg6_hmac_exit(); -#endif -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL + seg6_local_exit(); seg6_iptunnel_exit(); -#endif - unregister_pernet_subsys(&ip6_segments_ops); genl_unregister_family(&seg6_genl_family); + unregister_pernet_subsys(&ip6_segments_ops); } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index d43c50a7310d..ee6bac0160ac 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -16,7 +16,6 @@ #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> -#include <linux/slab.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> @@ -34,13 +33,22 @@ #include <net/addrconf.h> #include <net/xfrm.h> -#include <crypto/hash.h> +#include <crypto/sha1.h> +#include <crypto/sha2.h> +#include <crypto/utils.h> #include <net/seg6.h> #include <net/genetlink.h> #include <net/seg6_hmac.h> #include <linux/random.h> -static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); +struct hmac_storage { + local_lock_t bh_lock; + char hmac_ring[SEG6_HMAC_RING_SIZE]; +}; + +static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -70,17 +78,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = seg6_hmac_cmpfn, }; -static struct seg6_hmac_algo hmac_algos[] = { - { - .alg_id = SEG6_HMAC_ALGO_SHA1, - .name = "hmac(sha1)", - }, - { - .alg_id = SEG6_HMAC_ALGO_SHA256, - .name = "hmac(sha256)", - }, -}; - static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; @@ -100,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) return tlv; } -static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) -{ - struct seg6_hmac_algo *algo; - int i, alg_count; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - if (algo->alg_id == alg_id) - return algo; - } - - return NULL; -} - -static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, - u8 *output, int outlen) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int ret, dgsize; - - algo = __hmac_get_algo(hinfo->alg_id); - if (!algo) - return -ENOENT; - - tfm = *this_cpu_ptr(algo->tfms); - - dgsize = crypto_shash_digestsize(tfm); - if (dgsize > outlen) { - pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", - dgsize, outlen); - return -ENOMEM; - } - - ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); - goto failed; - } - - shash = *this_cpu_ptr(algo->shashs); - shash->tfm = tfm; - - ret = crypto_shash_digest(shash, text, psize, output); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); - goto failed; - } - - return dgsize; - -failed: - return ret; -} - int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); - u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; - int plen, i, dgsize, wrsize; + int plen, i, ret = 0; char *ring, *off; - /* a 160-byte buffer for digest output allows to store highest known - * hash function (RadioGatun) with up to 1216 bits - */ - /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; @@ -187,7 +122,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, */ local_bh_disable(); - ring = this_cpu_ptr(hmac_ring); + local_lock_nested_bh(&hmac_storage.bh_lock); + ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ @@ -210,21 +146,25 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, off += 16; } - dgsize = __do_hmac(hinfo, ring, plen, tmp_out, - SEG6_HMAC_MAX_DIGESTSIZE); + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1(&hinfo->key.sha1, ring, plen, output); + static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); + memset(&output[SHA1_DIGEST_SIZE], 0, + SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256(&hinfo->key.sha256, ring, plen, output); + static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + break; + } + local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); - - if (dgsize < 0) - return dgsize; - - wrsize = SEG6_HMAC_FIELD_LEN; - if (wrsize > dgsize) - wrsize = dgsize; - - memset(output, 0, SEG6_HMAC_FIELD_LEN); - memcpy(output, tmp_out, wrsize); - - return 0; + return ret; } EXPORT_SYMBOL(seg6_hmac_compute); @@ -241,6 +181,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) struct sr6_tlv_hmac *tlv; struct ipv6_sr_hdr *srh; struct inet6_dev *idev; + int require_hmac; idev = __in6_dev_get(skb->dev); @@ -248,16 +189,17 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) tlv = seg6_get_tlv_hmac(srh); + require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac); /* mandatory check but no tlv */ - if (idev->cnf.seg6_require_hmac > 0 && !tlv) + if (require_hmac > 0 && !tlv) return false; /* no check */ - if (idev->cnf.seg6_require_hmac < 0) + if (require_hmac < 0) return true; /* check only if present */ - if (idev->cnf.seg6_require_hmac == 0 && !tlv) + if (require_hmac == 0 && !tlv) return true; /* now, seg6_require_hmac >= 0 && tlv */ @@ -269,7 +211,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; - if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0) + if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN)) return false; return true; @@ -293,6 +235,19 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_preparekey(&hinfo->key.sha1, + hinfo->secret, hinfo->slen); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_preparekey(&hinfo->key.sha256, + hinfo->secret, hinfo->slen); + break; + default: + return -EINVAL; + } + err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); @@ -348,58 +303,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_algo(void) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - - alg_count = ARRAY_SIZE(hmac_algos); - - for (i = 0; i < alg_count; i++) { - struct crypto_shash **p_tfm; - int shsize; - - algo = &hmac_algos[i]; - algo->tfms = alloc_percpu(struct crypto_shash *); - if (!algo->tfms) - return -ENOMEM; - - for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - p_tfm = per_cpu_ptr(algo->tfms, cpu); - *p_tfm = tfm; - } - - p_tfm = raw_cpu_ptr(algo->tfms); - tfm = *p_tfm; - - shsize = sizeof(*shash) + crypto_shash_descsize(tfm); - - algo->shashs = alloc_percpu(struct shash_desc *); - if (!algo->shashs) - return -ENOMEM; - - for_each_possible_cpu(cpu) { - shash = kzalloc_node(shsize, GFP_KERNEL, - cpu_to_node(cpu)); - if (!shash) - return -ENOMEM; - *per_cpu_ptr(algo->shashs, cpu) = shash; - } - } - - return 0; -} - -int __init seg6_hmac_init(void) -{ - return seg6_hmac_init_algo(); -} - int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); @@ -407,29 +310,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -void seg6_hmac_exit(void) -{ - struct seg6_hmac_algo *algo = NULL; - int i, alg_count, cpu; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - for_each_possible_cpu(cpu) { - struct crypto_shash *tfm; - struct shash_desc *shash; - - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); - } - free_percpu(algo->tfms); - free_percpu(algo->shashs); - } -} -EXPORT_SYMBOL(seg6_hmac_exit); - void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 34db881204d2..3e1b9991131a 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -124,11 +124,12 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, return flowlabel; } -/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; @@ -137,7 +138,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -181,7 +182,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { @@ -197,15 +198,23 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) return 0; } + +/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +{ + return __seg6_do_srh_encap(skb, osrh, proto, NULL); +} EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, - struct ipv6_sr_hdr *osrh, int proto) + struct ipv6_sr_hdr *osrh, int proto, + struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; int hdrlen = ipv6_optlen(osrh); int red_tlv_offset, tlv_offset; @@ -230,7 +239,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, tot_len = red_hdrlen + sizeof(struct ipv6hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -263,7 +272,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, if (skip_srh) { hdr->nexthdr = proto; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); goto out; } @@ -299,7 +308,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, srcaddr: isrh->nexthdr = proto; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (unlikely(!skip_srh && sr_has_hmac(isrh))) { @@ -317,8 +326,8 @@ out: return 0; } -/* insert an SRH within an IPv6 packet, just after the IPv6 header */ -int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; @@ -326,7 +335,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -355,7 +364,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net(skb); err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) @@ -369,9 +378,8 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } -EXPORT_SYMBOL_GPL(seg6_do_srh_inline); -static int seg6_do_srh(struct sk_buff *skb) +static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; @@ -384,7 +392,7 @@ static int seg6_do_srh(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; - err = seg6_do_srh_inline(skb, tinfo->srh); + err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; @@ -402,9 +410,11 @@ static int seg6_do_srh(struct sk_buff *skb) return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, proto); + err = __seg6_do_srh_encap(skb, tinfo->srh, + proto, cache_dst); else - err = seg6_do_srh_encap_red(skb, tinfo->srh, proto); + err = seg6_do_srh_encap_red(skb, tinfo->srh, + proto, cache_dst); if (err) return err; @@ -425,11 +435,13 @@ static int seg6_do_srh(struct sk_buff *skb) skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, - IPPROTO_ETHERNET); + err = __seg6_do_srh_encap(skb, tinfo->srh, + IPPROTO_ETHERNET, + cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, - IPPROTO_ETHERNET); + IPPROTO_ETHERNET, + cache_dst); if (err) return err; @@ -444,6 +456,13 @@ static int seg6_do_srh(struct sk_buff *skb) return 0; } +/* insert an SRH within an IPv6 packet, just after the IPv6 header */ +int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +{ + return __seg6_do_srh_inline(skb, osrh, NULL); +} +EXPORT_SYMBOL_GPL(seg6_do_srh_inline); + static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -455,51 +474,63 @@ static int seg6_input_core(struct net *net, struct sock *sk, { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + slwt = seg6_lwt_lwtunnel(lwtst); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); + local_bh_enable(); - skb_dst_drop(skb); + err = seg6_do_srh(skb, dst); + if (unlikely(err)) { + dst_release(dst); + goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { - preempt_disable(); + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); + if (unlikely(err)) + goto drop; } else { + skb_dst_drop(skb); skb_dst_set(skb, dst); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - return err; - if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, seg6_input_finish); + skb_dst_dev(skb), seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); +drop: + kfree_skb(skb); + return err; } static int seg6_input_nf(struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); struct net *net = dev_net(skb->dev); switch (skb->protocol) { @@ -530,15 +561,15 @@ static int seg6_output_core(struct net *net, struct sock *sk, struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); + local_bh_enable(); + + err = seg6_do_srh(skb, dst); + if (unlikely(err)) + goto drop; if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -554,35 +585,38 @@ static int seg6_output_core(struct net *net, struct sock *sk, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - preempt_disable(); - dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - preempt_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, - NULL, skb_dst(skb)->dev, dst_output); + NULL, dst_dev(dst), dst_output); return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); switch (skb->protocol) { case htons(ETH_P_IP): diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 487f8e98deaa..2b41e4c0dddd 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -109,8 +109,19 @@ struct bpf_lwt_prog { #define next_csid_chk_lcnode_fn_bits(flen) \ next_csid_chk_lcblock_bits(flen) -/* Supported Flavor operations are reported in this bitmask */ -#define SEG6_LOCAL_FLV_SUPP_OPS (BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID)) +/* flag indicating that flavors are set up for a given End* behavior */ +#define SEG6_F_LOCAL_FLAVORS SEG6_F_ATTR(SEG6_LOCAL_FLAVORS) + +#define SEG6_F_LOCAL_FLV_OP(flvname) BIT(SEG6_LOCAL_FLV_OP_##flvname) +#define SEG6_F_LOCAL_FLV_NEXT_CSID SEG6_F_LOCAL_FLV_OP(NEXT_CSID) +#define SEG6_F_LOCAL_FLV_PSP SEG6_F_LOCAL_FLV_OP(PSP) + +/* Supported RFC8986 Flavor operations are reported in this bitmask */ +#define SEG6_LOCAL_FLV8986_SUPP_OPS SEG6_F_LOCAL_FLV_PSP + +#define SEG6_LOCAL_END_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_NEXT_CSID | \ + SEG6_LOCAL_FLV8986_SUPP_OPS) +#define SEG6_LOCAL_END_X_FLV_SUPP_OPS SEG6_F_LOCAL_FLV_NEXT_CSID struct seg6_flavors_info { /* Flavor operations */ @@ -259,7 +270,7 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) static int seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id, bool local_delivery) + u32 tbl_id, bool local_delivery, int oif) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -271,6 +282,7 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_iif = skb->dev->ifindex; + fl6.flowi6_oif = oif; fl6.daddr = nhaddr ? *nhaddr : hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); @@ -280,17 +292,19 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (nhaddr) fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; - if (!tbl_id) { + if (!tbl_id && !oif) { dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags); - } else { + } else if (tbl_id) { struct fib6_table *table; table = fib6_get_table(net, tbl_id); if (!table) goto out; - rt = ip6_pol_route(net, table, 0, &fl6, skb, flags); + rt = ip6_pol_route(net, table, oif, &fl6, skb, flags); dst = &rt->dst; + } else { + dst = ip6_route_output(net, NULL, &fl6); } /* we want to discard traffic destined for local packet processing, @@ -299,7 +313,7 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (!local_delivery) dev_flags |= IFF_LOOPBACK; - if (dst && (dst->dev->flags & dev_flags) && !dst->error) { + if (dst && (dst_dev(dst)->flags & dev_flags) && !dst->error) { dst_release(dst); dst = NULL; } @@ -319,7 +333,7 @@ out: int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id) { - return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); + return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false, 0); } static __u8 seg6_flv_lcblock_octects(const struct seg6_flavors_info *finfo) @@ -364,6 +378,14 @@ static void seg6_next_csid_advance_arg(struct in6_addr *addr, memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects); } +static int input_action_end_finish(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); +} + static int input_action_end_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -375,9 +397,7 @@ static int input_action_end_core(struct sk_buff *skb, advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - seg6_lookup_nexthop(skb, NULL, 0); - - return dst_input(skb); + return input_action_end_finish(skb, slwt); drop: kfree_skb(skb); @@ -395,47 +415,417 @@ static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) /* update DA */ seg6_next_csid_advance_arg(daddr, finfo); - seg6_lookup_nexthop(skb, NULL, 0); + return input_action_end_finish(skb, slwt); +} + +static int input_action_end_x_finish(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + seg6_lookup_any_nexthop(skb, &slwt->nh6, 0, false, slwt->oif); return dst_input(skb); } -static bool seg6_next_csid_enabled(__u32 fops) +static int input_action_end_x_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) { - return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID); + struct ipv6_sr_hdr *srh; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + return input_action_end_x_finish(skb, slwt); + +drop: + kfree_skb(skb); + return -EINVAL; } -/* regular endpoint function */ -static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) +static int end_x_next_csid_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) { const struct seg6_flavors_info *finfo = &slwt->flv_info; + struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; - if (seg6_next_csid_enabled(finfo->flv_ops)) - return end_next_csid_core(skb, slwt); + if (seg6_next_csid_is_arg_zero(daddr, finfo)) + return input_action_end_x_core(skb, slwt); - return input_action_end_core(skb, slwt); + /* update DA */ + seg6_next_csid_advance_arg(daddr, finfo); + + return input_action_end_x_finish(skb, slwt); } -/* regular endpoint, and forward to specified nexthop */ -static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) +static bool seg6_next_csid_enabled(__u32 fops) { - struct ipv6_sr_hdr *srh; + return fops & SEG6_F_LOCAL_FLV_NEXT_CSID; +} + +/* Processing of SRv6 End, End.X, and End.T behaviors can be extended through + * the flavors framework. These behaviors must report the subset of (flavor) + * operations they currently implement. In this way, if a user specifies a + * flavor combination that is not supported by a given End* behavior, the + * kernel refuses to instantiate the tunnel reporting the error. + */ +static int seg6_flv_supp_ops_by_action(int action, __u32 *fops) +{ + switch (action) { + case SEG6_LOCAL_ACTION_END: + *fops = SEG6_LOCAL_END_FLV_SUPP_OPS; + break; + case SEG6_LOCAL_ACTION_END_X: + *fops = SEG6_LOCAL_END_X_FLV_SUPP_OPS; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +/* We describe the packet state in relation to the absence/presence of the SRH + * and the Segment Left (SL) field. + * For our purposes, it is not necessary to record the exact value of the SL + * when the SID List consists of two or more segments. + */ +enum seg6_local_pktinfo { + /* the order really matters! */ + SEG6_LOCAL_PKTINFO_NOHDR = 0, + SEG6_LOCAL_PKTINFO_SL_ZERO, + SEG6_LOCAL_PKTINFO_SL_ONE, + SEG6_LOCAL_PKTINFO_SL_MORE, + __SEG6_LOCAL_PKTINFO_MAX, +}; + +#define SEG6_LOCAL_PKTINFO_MAX (__SEG6_LOCAL_PKTINFO_MAX - 1) + +static enum seg6_local_pktinfo seg6_get_srh_pktinfo(struct ipv6_sr_hdr *srh) +{ + __u8 sgl; - srh = get_and_validate_srh(skb); if (!srh) + return SEG6_LOCAL_PKTINFO_NOHDR; + + sgl = srh->segments_left; + if (sgl < 2) + return SEG6_LOCAL_PKTINFO_SL_ZERO + sgl; + + return SEG6_LOCAL_PKTINFO_SL_MORE; +} + +enum seg6_local_flv_action { + SEG6_LOCAL_FLV_ACT_UNSPEC = 0, + SEG6_LOCAL_FLV_ACT_END, + SEG6_LOCAL_FLV_ACT_PSP, + SEG6_LOCAL_FLV_ACT_USP, + SEG6_LOCAL_FLV_ACT_USD, + __SEG6_LOCAL_FLV_ACT_MAX +}; + +#define SEG6_LOCAL_FLV_ACT_MAX (__SEG6_LOCAL_FLV_ACT_MAX - 1) + +/* The action table for RFC8986 flavors (see the flv8986_act_tbl below) + * contains the actions (i.e. processing operations) to be applied on packets + * when flavors are configured for an End* behavior. + * By combining the pkinfo data and from the flavors mask, the macro + * computes the index used to access the elements (actions) stored in the + * action table. The index is structured as follows: + * + * index + * _______________/\________________ + * / \ + * +----------------+----------------+ + * | pf | afm | + * +----------------+----------------+ + * ph-1 ... p1 p0 fk-1 ... f1 f0 + * MSB LSB + * + * where: + * - 'afm' (adjusted flavor mask) is the mask containing a combination of the + * RFC8986 flavors currently supported. 'afm' corresponds to the @fm + * argument of the macro whose value is righ-shifted by 1 bit. By doing so, + * we discard the SEG6_LOCAL_FLV_OP_UNSPEC flag (bit 0 in @fm) which is + * never used here; + * - 'pf' encodes the packet info (pktinfo) regarding the presence/absence of + * the SRH, SL = 0, etc. 'pf' is set with the value of @pf provided as + * argument to the macro. + */ +#define flv8986_act_tbl_idx(pf, fm) \ + ((((pf) << bits_per(SEG6_LOCAL_FLV8986_SUPP_OPS)) | \ + ((fm) & SEG6_LOCAL_FLV8986_SUPP_OPS)) >> SEG6_LOCAL_FLV_OP_PSP) + +/* We compute the size of the action table by considering the RFC8986 flavors + * actually supported by the kernel. In this way, the size is automatically + * adjusted when new flavors are supported. + */ +#define FLV8986_ACT_TBL_SIZE \ + roundup_pow_of_two(flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_MAX, \ + SEG6_LOCAL_FLV8986_SUPP_OPS)) + +/* tbl_cfg(act, pf, fm) macro is used to easily configure the action + * table; it accepts 3 arguments: + * i) @act, the suffix from SEG6_LOCAL_FLV_ACT_{act} representing + * the action that should be applied on the packet; + * ii) @pf, the suffix from SEG6_LOCAL_PKTINFO_{pf} reporting the packet + * info about the lack/presence of SRH, SRH with SL = 0, etc; + * iii) @fm, the mask of flavors. + */ +#define tbl_cfg(act, pf, fm) \ + [flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_##pf, \ + (fm))] = SEG6_LOCAL_FLV_ACT_##act + +/* shorthand for improving readability */ +#define F_PSP SEG6_F_LOCAL_FLV_PSP + +/* The table contains, for each combination of the pktinfo data and + * flavors, the action that should be taken on a packet (e.g. + * "standard" Endpoint processing, Penultimate Segment Pop, etc). + * + * By default, table entries not explicitly configured are initialized with the + * SEG6_LOCAL_FLV_ACT_UNSPEC action, which generally has the effect of + * discarding the processed packet. + */ +static const u8 flv8986_act_tbl[FLV8986_ACT_TBL_SIZE] = { + /* PSP variant for packet where SRH with SL = 1 */ + tbl_cfg(PSP, SL_ONE, F_PSP), + /* End for packet where the SRH with SL > 1*/ + tbl_cfg(END, SL_MORE, F_PSP), +}; + +#undef F_PSP +#undef tbl_cfg + +/* For each flavor defined in RFC8986 (or a combination of them) an action is + * performed on the packet. The specific action depends on: + * - info extracted from the packet (i.e. pktinfo data) regarding the + * lack/presence of the SRH, and if the SRH is available, on the value of + * Segment Left field; + * - the mask of flavors configured for the specific SRv6 End* behavior. + * + * The function combines both the pkinfo and the flavors mask to evaluate the + * corresponding action to be taken on the packet. + */ +static enum seg6_local_flv_action +seg6_local_flv8986_act_lookup(enum seg6_local_pktinfo pinfo, __u32 flvmask) +{ + unsigned long index; + + /* check if the provided mask of flavors is supported */ + if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) + return SEG6_LOCAL_FLV_ACT_UNSPEC; + + index = flv8986_act_tbl_idx(pinfo, flvmask); + if (unlikely(index >= FLV8986_ACT_TBL_SIZE)) + return SEG6_LOCAL_FLV_ACT_UNSPEC; + + return flv8986_act_tbl[index]; +} + +/* skb->data must be aligned with skb->network_header */ +static bool seg6_pop_srh(struct sk_buff *skb, int srhoff) +{ + struct ipv6_sr_hdr *srh; + struct ipv6hdr *iph; + __u8 srh_nexthdr; + int thoff = -1; + int srhlen; + int nhlen; + + if (unlikely(srhoff < sizeof(*iph) || + !pskb_may_pull(skb, srhoff + sizeof(*srh)))) + return false; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srhlen = ipv6_optlen(srh); + + /* we are about to mangle the pkt, let's check if we can write on it */ + if (unlikely(skb_ensure_writable(skb, srhoff + srhlen))) + return false; + + /* skb_ensure_writable() may change skb pointers; evaluate srh again */ + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_nexthdr = srh->nexthdr; + + if (unlikely(!skb_transport_header_was_set(skb))) + goto pull; + + nhlen = skb_network_header_len(skb); + /* we have to deal with the transport header: it could be set before + * the SRH, after the SRH, or within it (which is considered wrong, + * however). + */ + if (likely(nhlen <= srhoff)) + thoff = nhlen; + else if (nhlen >= srhoff + srhlen) + /* transport_header is set after the SRH */ + thoff = nhlen - srhlen; + else + /* transport_header falls inside the SRH; hence, we can't + * restore the transport_header pointer properly after + * SRH removing operation. + */ + return false; +pull: + /* we need to pop the SRH: + * 1) first of all, we pull out everything from IPv6 header up to SRH + * (included) evaluating also the rcsum; + * 2) we overwrite (and then remove) the SRH by properly moving the + * IPv6 along with any extension header that precedes the SRH; + * 3) At the end, we push back the pulled headers (except for SRH, + * obviously). + */ + skb_pull_rcsum(skb, srhoff + srhlen); + memmove(skb_network_header(skb) + srhlen, skb_network_header(skb), + srhoff); + skb_push(skb, srhoff); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + if (likely(thoff >= 0)) + skb_set_transport_header(skb, thoff); + + iph = ipv6_hdr(skb); + if (iph->nexthdr == NEXTHDR_ROUTING) { + iph->nexthdr = srh_nexthdr; + } else { + /* we must look for the extension header (EXTH, for short) that + * immediately precedes the SRH we have just removed. + * Then, we update the value of the EXTH nexthdr with the one + * contained in the SRH nexthdr. + */ + unsigned int off = sizeof(*iph); + struct ipv6_opt_hdr *hp, _hdr; + __u8 nexthdr = iph->nexthdr; + + for (;;) { + if (unlikely(!ipv6_ext_hdr(nexthdr) || + nexthdr == NEXTHDR_NONE)) + return false; + + hp = skb_header_pointer(skb, off, sizeof(_hdr), &_hdr); + if (unlikely(!hp)) + return false; + + if (hp->nexthdr == NEXTHDR_ROUTING) { + hp->nexthdr = srh_nexthdr; + break; + } + + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + fallthrough; + case NEXTHDR_AUTH: + /* we expect SRH before FRAG and AUTH */ + return false; + default: + off += ipv6_optlen(hp); + break; + } + + nexthdr = hp->nexthdr; + } + } + + iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + skb_postpush_rcsum(skb, iph, srhoff); + + return true; +} + +/* process the packet on the basis of the RFC8986 flavors set for the given + * SRv6 End behavior instance. + */ +static int end_flv8986_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + enum seg6_local_flv_action action; + enum seg6_local_pktinfo pinfo; + struct ipv6_sr_hdr *srh; + __u32 flvmask; + int srhoff; + + srh = seg6_get_srh(skb, 0); + srhoff = srh ? ((unsigned char *)srh - skb->data) : 0; + pinfo = seg6_get_srh_pktinfo(srh); +#ifdef CONFIG_IPV6_SEG6_HMAC + if (srh && !seg6_hmac_validate_skb(skb)) + goto drop; +#endif + flvmask = finfo->flv_ops; + if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) { + pr_warn_once("seg6local: invalid RFC8986 flavors\n"); goto drop; + } - advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + /* retrieve the action triggered by the combination of pktinfo data and + * the flavors mask. + */ + action = seg6_local_flv8986_act_lookup(pinfo, flvmask); + switch (action) { + case SEG6_LOCAL_FLV_ACT_END: + /* process the packet as the "standard" End behavior */ + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + break; + case SEG6_LOCAL_FLV_ACT_PSP: + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - seg6_lookup_nexthop(skb, &slwt->nh6, 0); + if (unlikely(!seg6_pop_srh(skb, srhoff))) + goto drop; + break; + case SEG6_LOCAL_FLV_ACT_UNSPEC: + fallthrough; + default: + /* by default, we drop the packet since we could not find a + * suitable action. + */ + goto drop; + } - return dst_input(skb); + return input_action_end_finish(skb, slwt); drop: kfree_skb(skb); return -EINVAL; } +/* regular endpoint function */ +static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; + + if (!fops) + return input_action_end_core(skb, slwt); + + /* check for the presence of NEXT-C-SID since it applies first */ + if (seg6_next_csid_enabled(fops)) + return end_next_csid_core(skb, slwt); + + /* the specific processing function to be performed on the packet + * depends on the combination of flavors defined in RFC8986 and some + * information extracted from the packet, e.g. presence/absence of SRH, + * Segment Left = 0, etc. + */ + return end_flv8986_core(skb, slwt); +} + +/* regular endpoint, and forward to specified nexthop */ +static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; + + /* check for the presence of NEXT-C-SID since it applies first */ + if (seg6_next_csid_enabled(fops)) + return end_x_next_csid_core(skb, slwt); + + return input_action_end_x_core(skb, slwt); +} + static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt) { struct ipv6_sr_hdr *srh; @@ -554,8 +944,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx6_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx6_finish); return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb); drop: @@ -567,10 +957,10 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); + enum skb_drop_reason reason; struct seg6_local_lwt *slwt; struct iphdr *iph; __be32 nhaddr; - int err; slwt = seg6_local_lwtunnel(orig_dst->lwtstate); @@ -580,9 +970,9 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, skb_dst_drop(skb); - err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); - if (err) { - kfree_skb(skb); + reason = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); + if (reason) { + kfree_skb_reason(skb, reason); return -EINVAL; } @@ -604,8 +994,8 @@ static int input_action_end_dx4(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx4_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx4_finish); return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb); drop: @@ -787,8 +1177,8 @@ drop: static int input_action_end_dt4(struct sk_buff *skb, struct seg6_local_lwt *slwt) { + enum skb_drop_reason reason; struct iphdr *iph; - int err; if (!decap_and_validate(skb, IPPROTO_IPIP)) goto drop; @@ -806,8 +1196,8 @@ static int input_action_end_dt4(struct sk_buff *skb, iph = ip_hdr(skb); - err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); - if (unlikely(err)) + reason = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); + if (unlikely(reason)) goto drop; return dst_input(skb); @@ -890,7 +1280,7 @@ static int input_action_end_dt6(struct sk_buff *skb, /* note: this time we do not need to specify the table because the VRF * takes care of selecting the correct table. */ - seg6_lookup_any_nexthop(skb, NULL, 0, true); + seg6_lookup_any_nexthop(skb, NULL, 0, true, 0); return dst_input(skb); @@ -898,7 +1288,7 @@ legacy_mode: #endif skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - seg6_lookup_any_nexthop(skb, NULL, slwt->table, true); + seg6_lookup_any_nexthop(skb, NULL, slwt->table, true, 0); return dst_input(skb); @@ -993,7 +1383,9 @@ drop: return err; } -DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; bool seg6_bpf_has_valid_srh(struct sk_buff *skb) { @@ -1001,6 +1393,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return false; @@ -1021,8 +1414,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct seg6_bpf_srh_state *srh_state = - this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state *srh_state; struct ipv6_sr_hdr *srh; int ret; @@ -1033,10 +1425,14 @@ static int input_action_end_bpf(struct sk_buff *skb, } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - /* preempt_disable is needed to protect the per-CPU buffer srh_state, - * which is also accessed by the bpf_lwt_seg6_* helpers + /* The access to the per-CPU buffer srh_state is protected by running + * always in softirq context (with disabled BH). On PREEMPT_RT the + * required locking is provided by the following local_lock_nested_bh() + * statement. It is also accessed by the bpf_lwt_seg6_* helpers via + * bpf_prog_run_save_cb(). */ - preempt_disable(); + local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock); + srh_state = this_cpu_ptr(&seg6_bpf_srh_states); srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; srh_state->valid = true; @@ -1059,15 +1455,15 @@ static int input_action_end_bpf(struct sk_buff *skb, if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); - preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: - preempt_enable(); + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); kfree_skb(skb); return -EINVAL; } @@ -1077,13 +1473,15 @@ static struct seg6_action_desc seg6_action_table[] = { .action = SEG6_LOCAL_ACTION_END, .attrs = 0, .optattrs = SEG6_F_LOCAL_COUNTERS | - SEG6_F_ATTR(SEG6_LOCAL_FLAVORS), + SEG6_F_LOCAL_FLAVORS, .input = input_action_end, }, { .action = SEG6_LOCAL_ACTION_END_X, .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), - .optattrs = SEG6_F_LOCAL_COUNTERS, + .optattrs = SEG6_F_LOCAL_COUNTERS | + SEG6_F_LOCAL_FLAVORS | + SEG6_F_ATTR(SEG6_LOCAL_OIF), .input = input_action_end_x, }, { @@ -1250,10 +1648,8 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 }, - [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_NH4] = NLA_POLICY_EXACT_LEN(sizeof(struct in_addr)), + [SEG6_LOCAL_NH6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, @@ -1691,7 +2087,7 @@ struct nla_policy seg6_local_flavors_policy[SEG6_LOCAL_FLV_MAX + 1] = { static int seg6_chk_next_csid_cfg(__u8 block_len, __u8 func_len) { /* Locator-Block and Locator-Node Function cannot exceed 128 bits - * (i.e. C-SID container lenghts). + * (i.e. C-SID container length). */ if (next_csid_chk_cntr_bits(block_len, func_len)) return -EINVAL; @@ -1743,7 +2139,8 @@ static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, { struct seg6_flavors_info *finfo = &slwt->flv_info; struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1]; - unsigned long fops; + int action = slwt->action; + __u32 fops, supp_fops; int rc; rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX, @@ -1759,7 +2156,8 @@ static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, return -EINVAL; fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]); - if (fops & ~SEG6_LOCAL_FLV_SUPP_OPS) { + rc = seg6_flv_supp_ops_by_action(action, &supp_fops); + if (rc < 0 || (fops & ~supp_fops)) { NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)"); return -EOPNOTSUPP; } @@ -2291,6 +2689,11 @@ int __init seg6_local_init(void) */ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long)); + /* Check whether the number of defined flavors exceeds the maximum + * allowed value. + */ + BUILD_BUG_ON(SEG6_LOCAL_FLV_OP_MAX + 1 > BITS_PER_TYPE(__u32)); + /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in * bits) have been changed with invalid values, kernel build stops * here. @@ -2300,6 +2703,13 @@ int __init seg6_local_init(void) BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS)); BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS)); + /* To be memory efficient, we use 'u8' to represent the different + * actions related to RFC8986 flavors. If the kernel build stops here, + * it means that it is not possible to correctly encode these actions + * with the data type chosen for the action table. + */ + BUILD_BUG_ON(SEG6_LOCAL_FLV_ACT_MAX > (typeof(flv8986_act_tbl[0]))~0U); + return lwtunnel_encap_add_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 70d81bba5093..cf37ad9686e6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -51,6 +51,8 @@ #include <net/dsfield.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> +#include <net/inet_dscp.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -132,8 +134,8 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, - struct ip_tunnel_parm *parms) +static struct ip_tunnel __rcu ** +__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -200,14 +202,13 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct sit_net *sitn = net_generic(net, sit_net_id); + struct sit_net *sitn = net_generic(t->net, sit_net_id); int err; __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); - if ((__force u16)t->parms.i_flags & SIT_ISATAP) + if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags)) dev->priv_flags |= IFF_ISATAP; dev->rtnl_link_ops = &sit_link_ops; @@ -226,7 +227,8 @@ out: } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) + struct ip_tunnel_parm_kern *parms, + int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -267,6 +269,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, nt = netdev_priv(dev); + nt->net = net; nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; @@ -845,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel, return dst; } +static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst, + bool is_isatap) +{ + const struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct neighbour *neigh = NULL; + const struct in6_addr *addr6; + bool found = false; + int addr_type; + + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); + + if (!neigh) { + net_dbg_ratelimited("nexthop == NULL\n"); + return false; + } + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (is_isatap) { + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } else { + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } + + neigh_release(neigh); + + return found; +} + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. @@ -864,8 +910,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; - const struct in6_addr *addr6; - int addr_type; u8 ttl; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); @@ -874,68 +918,19 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ - if (dev->priv_flags & IFF_ISATAP) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if ((addr_type & IPV6_ADDR_UNICAST) && - ipv6_addr_is_isatap(addr6)) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if ((dev->priv_flags & IFF_ISATAP) && + !ipip6_tunnel_dst_find(skb, &dst, true)) + goto tx_error; if (!dst) dst = try_6rd(tunnel, &iph6->daddr); - if (!dst) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); - } - - if ((addr_type & IPV6_ADDR_COMPATv4) != 0) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false)) + goto tx_error; flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, - RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, - 0, dst, tiph->saddr, 0, 0, + tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, + IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); @@ -1024,7 +1019,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) { + if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) { ip_rt_put(rt); goto tx_error; } @@ -1032,7 +1027,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, IPPROTO_IPV6); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return NETDEV_TX_OK; tx_error_icmp: @@ -1095,12 +1090,13 @@ tx_err: static void ipip6_tunnel_bind_dev(struct net_device *dev) { + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); struct net_device *tdev = NULL; - struct ip_tunnel *tunnel; + int hlen = LL_MAX_HEADER; const struct iphdr *iph; struct flowi4 fl4; - tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { @@ -1109,7 +1105,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) iph->daddr, iph->saddr, 0, 0, IPPROTO_IPV6, - RT_TOS(iph->tos), + iph->tos & INET_DSCP_MASK, tunnel->parms.link); if (!IS_ERR(rt)) { @@ -1123,17 +1119,19 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev && !netif_is_l3_master(tdev)) { - int t_hlen = tunnel->hlen + sizeof(struct iphdr); int mtu; mtu = tdev->mtu - t_hlen; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); + hlen = tdev->hard_header_len + tdev->needed_headroom; } + dev->needed_headroom = t_hlen + hlen; } -static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, +static void ipip6_tunnel_update(struct ip_tunnel *t, + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct net *net = t->net; @@ -1194,11 +1192,11 @@ static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; struct ip_tunnel_6rd ip6rd; - struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1249,7 +1247,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) +__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1266,7 +1264,7 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1279,7 +1277,7 @@ ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1295,7 +1293,7 @@ ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1326,7 +1324,7 @@ ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1346,7 +1344,8 @@ ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { switch (cmd) { case SIOCGETTUNNEL: @@ -1396,7 +1395,6 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; @@ -1406,7 +1404,6 @@ static void ipip6_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); dst_cache_destroy(&tunnel->dst_cache); - free_percpu(dev->tstats); } #define SIT_FEATURES (NETIF_F_SG | \ @@ -1432,9 +1429,11 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + } static int ipip6_tunnel_init(struct net_device *dev) @@ -1443,21 +1442,16 @@ static int ipip6_tunnel_init(struct net_device *dev) int err; tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (err) { - free_percpu(dev->tstats); - dev->tstats = NULL; + if (err) return err; - } + netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; } @@ -1492,7 +1486,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip6_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -1548,19 +1542,23 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], } #endif -static int ipip6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + struct net *net; int err; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); @@ -1601,8 +1599,8 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD @@ -1689,7 +1687,7 @@ static size_t ipip6_get_size(const struct net_device *dev) static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || @@ -1699,7 +1697,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || - nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) || + nla_put_be16(skb, IFLA_IPTUN_FLAGS, + ip_tunnel_flags_to_be16(parm->i_flags)) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; @@ -1797,8 +1796,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = { }; #endif -static void __net_exit sit_destroy_tunnels(struct net *net, - struct list_head *head) +static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; @@ -1813,15 +1811,15 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; - t = rtnl_dereference(sitn->tunnels[prio][h]); + t = rtnl_net_dereference(net, sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1853,7 +1851,10 @@ static int __net_init sit_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + sitn->fb_tunnel_dev->netns_immutable = true; + + t = netdev_priv(sitn->fb_tunnel_dev); + t->net = net; err = register_netdev(sitn->fb_tunnel_dev); if (err) @@ -1862,8 +1863,6 @@ static int __net_init sit_init_net(struct net *net) ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); - t = netdev_priv(sitn->fb_tunnel_dev); - strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; @@ -1873,22 +1872,9 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_batch_net(struct list_head *net_list) -{ - LIST_HEAD(list); - struct net *net; - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, &list); - - unregister_netdevice_many(&list); - rtnl_unlock(); -} - static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch = sit_exit_batch_net, + .exit_rtnl = sit_exit_rtnl_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; @@ -1954,6 +1940,7 @@ xfrm_tunnel_failed: module_init(sit_init); module_exit(sit_cleanup); +MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 5014aa663452..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -114,80 +115,97 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) return __cookie_v6_init_sequence(iph, th, mssp); } -int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, - __u32 cookie) +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { + __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; - __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, - th->source, th->dest, seq); + __u32 mssind; + + mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, + th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v6_check); -struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) +static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct tcp_options_received tcp_opt; - struct inet_request_sock *ireq; - struct tcp_request_sock *treq; - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - const struct tcphdr *th = tcp_hdr(skb); - __u32 cookie = ntohl(th->ack_seq) - 1; - struct sock *ret = sk; - struct request_sock *req; - int full_space, mss; - struct dst_entry *dst; - __u8 rcv_wscale; u32 tsoff = 0; - - if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || - !th->ack || th->rst) - goto out; + int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; - mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie); - if (mss == 0) { - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb)); + if (!mss) { + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); + tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcpv6_ts_off(sock_net(sk), + tsoff = secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); tcp_opt.rcv_tsecr -= tsoff; } - if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt)) + if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; - ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, - &tcp_request_sock_ipv6_ops, sk, skb); - if (!req) + return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb, + &tcp_opt, mss, tsoff); +out: + return ERR_PTR(-EINVAL); +} + +struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct inet_request_sock *ireq; + struct net *net = sock_net(sk); + struct request_sock *req; + struct dst_entry *dst; + struct sock *ret = sk; + __u8 rcv_wscale; + int full_space; + SKB_DR(reason); + + if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; - ireq = inet_rsk(req); - treq = tcp_rsk(req); - treq->tfo_listener = false; + if (cookie_bpf_ok(skb)) { + req = cookie_bpf_check(sk, skb); + } else { + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + } + if (!req) { + SKB_DR_SET(reason, NO_SOCKET); + goto out_drop; + } - if (security_inet_conn_request(sk, skb, req)) - goto out_free; + ireq = inet_rsk(req); - req->mss = mss; - ireq->ir_rmt_port = th->source; - ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + + if (security_inet_conn_request(sk, skb, req)) { + SKB_DR_SET(reason, SECURITY_HOOK); + goto out_free; + } + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { @@ -195,27 +213,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->pktopts = skb; } - ireq->ir_iif = inet_request_bound_dev_if(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - ireq->ir_mark = inet_request_mark(sk, skb); - - req->num_retrans = 0; - ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->sack_ok = tcp_opt.sack_ok; - ireq->wscale_ok = tcp_opt.wscale_ok; - ireq->tstamp_ok = tcp_opt.saw_tstamp; - req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = 0; - treq->rcv_isn = ntohl(th->seq) - 1; - treq->snt_isn = cookie; - treq->ts_off = 0; - treq->txhash = net_tx_rndhash(); - if (IS_ENABLED(CONFIG_SMC)) - ireq->smc_ok = 0; + tcp_ao_syncookie(sk, skb, req, AF_INET6); /* * We need to lookup the dst_entry to get the correct window size. @@ -234,15 +237,17 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) + dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); + if (IS_ERR(dst)) { + SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; + } } - req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && @@ -254,13 +259,24 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; - ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); - - ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff); + /* req->syncookie is set true only if ACK is validated + * by BPF kfunc, then, rcv_wscale is already configured. + */ + if (!req->syncookie) + ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok &= cookie_ecn_ok(net, dst); + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); + + ret = tcp_get_cookie_sock(sk, skb, req, dst); + if (!ret) { + SKB_DR_SET(reason, NO_SOCKET); + goto out_drop; + } out: return ret; out_free: reqsk_free(req); +out_drop: + sk_skb_reason_drop(sk, skb, reason); return NULL; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 94a0a294c6a1..d2cd33e2698d 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -30,7 +30,7 @@ static u32 rt6_multipath_hash_fields_all_mask = static u32 ioam6_id_max = IOAM6_DEFAULT_ID; static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; -static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, +static int proc_rt6_multipath_hash_policy(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -46,7 +46,7 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, } static int -proc_rt6_multipath_hash_fields(struct ctl_table *table, int write, void *buffer, +proc_rt6_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -213,7 +213,6 @@ static struct ctl_table ipv6_table_template[] = { .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, - { } }; static struct ctl_table ipv6_rotable[] = { @@ -248,11 +247,11 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ - { } }; static int __net_init ipv6_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(ipv6_table_template); struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; @@ -264,7 +263,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ - for (i = 0; i < ARRAY_SIZE(ipv6_table_template) - 1; i++) + for (i = 0; i < table_size; i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); @@ -275,17 +274,22 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_icmp_table) goto out_ipv6_route_table; - net->ipv6.sysctl.hdr = register_net_sysctl(net, "net/ipv6", ipv6_table); + net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", + ipv6_table, table_size); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; - net->ipv6.sysctl.route_hdr = - register_net_sysctl(net, "net/ipv6/route", ipv6_route_table); + net->ipv6.sysctl.route_hdr = register_net_sysctl_sz(net, + "net/ipv6/route", + ipv6_route_table, + ipv6_route_sysctl_table_size(net)); if (!net->ipv6.sysctl.route_hdr) goto out_unregister_ipv6_table; - net->ipv6.sysctl.icmp_hdr = - register_net_sysctl(net, "net/ipv6/icmp", ipv6_icmp_table); + net->ipv6.sysctl.icmp_hdr = register_net_sysctl_sz(net, + "net/ipv6/icmp", + ipv6_icmp_table, + ipv6_icmp_sysctl_table_size()); if (!net->ipv6.sysctl.icmp_hdr) goto out_unregister_route_table; @@ -307,9 +311,9 @@ out_ipv6_table: static void __net_exit ipv6_sysctl_net_exit(struct net *net) { - struct ctl_table *ipv6_table; - struct ctl_table *ipv6_route_table; - struct ctl_table *ipv6_icmp_table; + const struct ctl_table *ipv6_table; + const struct ctl_table *ipv6_route_table; + const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c new file mode 100644 index 000000000000..3c09ac26206e --- /dev/null +++ b/net/ipv6/tcp_ao.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * INET An implementation of the TCP Authentication Option (TCP-AO). + * See RFC5925. + * + * Authors: Dmitry Safonov <dima@arista.com> + * Francesco Ruggeri <fruggeri@arista.com> + * Salam Noureddine <noureddine@arista.com> + */ +#include <crypto/hash.h> +#include <linux/tcp.h> + +#include <net/tcp.h> +#include <net/ipv6.h> + +static int tcp_v6_ao_calc_key(struct tcp_ao_key *mkt, u8 *key, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __be16 sport, __be16 dport, + __be32 sisn, __be32 disn) +{ + struct kdf_input_block { + u8 counter; + u8 label[6]; + struct tcp6_ao_context ctx; + __be16 outlen; + } __packed * tmp; + struct tcp_sigpool hp; + int err; + + err = tcp_sigpool_start(mkt->tcp_sigpool_id, &hp); + if (err) + return err; + + tmp = hp.scratch; + tmp->counter = 1; + memcpy(tmp->label, "TCP-AO", 6); + tmp->ctx.saddr = *saddr; + tmp->ctx.daddr = *daddr; + tmp->ctx.sport = sport; + tmp->ctx.dport = dport; + tmp->ctx.sisn = sisn; + tmp->ctx.disn = disn; + tmp->outlen = htons(tcp_ao_digest_size(mkt) * 8); /* in bits */ + + err = tcp_ao_calc_traffic_key(mkt, key, tmp, sizeof(*tmp), &hp); + tcp_sigpool_end(&hp); + + return err; +} + +int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key, + const struct sk_buff *skb, + __be32 sisn, __be32 disn) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + + return tcp_v6_ao_calc_key(mkt, key, &iph->saddr, + &iph->daddr, th->source, + th->dest, sisn, disn); +} + +int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, + const struct sock *sk, __be32 sisn, + __be32 disn, bool send) +{ + if (send) + return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_rcv_saddr, + &sk->sk_v6_daddr, htons(sk->sk_num), + sk->sk_dport, sisn, disn); + else + return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_daddr, + &sk->sk_v6_rcv_saddr, sk->sk_dport, + htons(sk->sk_num), disn, sisn); +} + +int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, + struct request_sock *req) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + return tcp_v6_ao_calc_key(mkt, key, + &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr, + htons(ireq->ir_num), ireq->ir_rmt_port, + htonl(tcp_rsk(req)->snt_isn), + htonl(tcp_rsk(req)->rcv_isn)); +} + +struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk, + struct sock *addr_sk, + int sndid, int rcvid) +{ + int l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); + struct in6_addr *addr = &addr_sk->sk_v6_daddr; + + return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, + AF_INET6, sndid, rcvid); +} + +struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk, + struct request_sock *req, + int sndid, int rcvid) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct in6_addr *addr = &ireq->ir_v6_rmt_addr; + int l3index; + + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); + return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, + AF_INET6, sndid, rcvid); +} + +int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, int nbytes) +{ + struct tcp6_pseudohdr *bp; + struct scatterlist sg; + + bp = hp->scratch; + /* 1. TCP pseudo-header (RFC2460) */ + bp->saddr = *saddr; + bp->daddr = *daddr; + bp->len = cpu_to_be32(nbytes); + bp->protocol = cpu_to_be32(IPPROTO_TCP); + + sg_init_one(&sg, bp, sizeof(*bp)); + ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp)); + return crypto_ahash_update(hp->req); +} + +int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, + const struct sock *sk, const struct sk_buff *skb, + const u8 *tkey, int hash_offset, u32 sne) +{ + return tcp_ao_hash_skb(AF_INET6, ao_hash, key, sk, skb, tkey, + hash_offset, sne); +} + +int tcp_v6_parse_ao(struct sock *sk, int cmd, + sockptr_t optval, int optlen) +{ + return tcp_parse_ao(sk, cmd, AF_INET6, optval, optlen); +} + +int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key, + struct request_sock *req, const struct sk_buff *skb, + int hash_offset, u32 sne) +{ + void *hash_buf = NULL; + int err; + + hash_buf = kmalloc(tcp_ao_digest_size(ao_key), GFP_ATOMIC); + if (!hash_buf) + return -ENOMEM; + + err = tcp_v6_ao_calc_key_rsk(ao_key, hash_buf, req); + if (err) + goto out; + + err = tcp_ao_hash_skb(AF_INET6, ao_hash, ao_key, req_to_sk(req), skb, + hash_buf, hash_offset, sne); +out: + kfree(hash_buf); + return err; +} diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 11b736a76bd7..280fe5978559 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -41,6 +41,7 @@ #include <linux/random.h> #include <linux/indirect_call_wrapper.h> +#include <net/aligned_data.h> #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> @@ -58,17 +59,20 @@ #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> +#include <net/hotdata.h> #include <net/busy_poll.h> +#include <net/rstreason.h> +#include <net/psp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <crypto/hash.h> -#include <linux/scatterlist.h> +#include <crypto/md5.h> #include <trace/events/tcp.h> -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -76,16 +80,9 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; -#else -static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr, - int l3index) -{ - return NULL; -} #endif /* Helper returning the inet6 address from a given tcp socket. @@ -93,23 +90,17 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, * This avoids a dereference and allow compiler optimizations. * It is a specialized version of inet6_sk_generic(). */ -static struct ipv6_pinfo *tcp_inet6_sk(const struct sock *sk) -{ - unsigned int offset = sizeof(struct tcp6_sock) - sizeof(struct ipv6_pinfo); - - return (struct ipv6_pinfo *)(((u8 *)sk) + offset); -} +#define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ + struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } @@ -127,7 +118,7 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } -static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to @@ -139,10 +130,10 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, sock_owned_by_me(sk); - return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } -static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; @@ -167,7 +158,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, memset(&fl6, 0, sizeof(fl6)); - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { @@ -243,11 +234,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif - err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + err = tcp_v4_connect(sk, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; @@ -256,7 +247,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; @@ -272,11 +263,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; + fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; + fl6.flowi6_uid = sk_uid(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); @@ -289,6 +283,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, goto failure; } + tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { @@ -304,12 +299,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); - icsk->icsk_ext_hdr_len = 0; + icsk->icsk_ext_hdr_len = psp_sk_overhead(sk); if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + - opt->opt_nflen; + icsk->icsk_ext_hdr_len += opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -384,7 +379,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; @@ -393,8 +388,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bool fatal; int err; - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->daddr, th->dest, + sk = __inet6_lookup_established(net, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); @@ -405,6 +399,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (sk->sk_state == TCP_TIME_WAIT) { + /* To increase the counter of ignored icmps for TCP-AO */ + tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } @@ -415,6 +411,11 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } + if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { + sock_put(sk); + return 0; + } + bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); @@ -491,13 +492,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); - if (!sock_owned_by_user(sk)) { - sk->sk_err = err; - sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - - tcp_done(sk); - } else - sk->sk_err_soft = err; + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else + WRITE_ONCE(sk->sk_err_soft, err); goto out; case TCP_LISTEN: break; @@ -510,12 +508,12 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tcp_ld_RTO_revert(sk, seq); } - if (!sock_owned_by_user(sk) && np->recverr) { - sk->sk_err = err; + if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { + WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); - } else - sk->sk_err_soft = err; - + } else { + WRITE_ONCE(sk->sk_err_soft, err); + } out: bh_unlock_sock(sk); sock_put(sk); @@ -531,7 +529,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = tcp_inet6_sk(sk); + const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; @@ -546,11 +544,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; - if (np->repflow && ireq->pktopts) + if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? @@ -566,8 +565,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, skb->mark ? : sk->sk_mark, opt, - tclass, sk->sk_priority); + err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), + opt, tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -608,8 +607,10 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; + union tcp_ao_addr *addr; int l3index = 0; u8 prefixlen; + bool l3flag; u8 flags; if (optlen < sizeof(cmd)) @@ -622,6 +623,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; + l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { @@ -662,83 +664,71 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { + addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3]; + + /* Don't allow keys for peers that have a matching TCP-AO key. + * See the comment in tcp_ao_add_cmd() + */ + if (tcp_ao_required(sk, addr, AF_INET, + l3flag ? l3index : -1, false)) + return -EKEYREJECTED; + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); + } + + addr = (union tcp_md5_addr *)&sin6->sin6_addr; - return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, l3index, flags, + /* Don't allow keys for peers that have a matching TCP-AO key. + * See the comment in tcp_ao_add_cmd() + */ + if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false)) + return -EKEYREJECTED; + + return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } -static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - const struct tcphdr *th, int nbytes) +static void tcp_v6_md5_hash_headers(struct md5_ctx *ctx, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + const struct tcphdr *th, int nbytes) { - struct tcp6_pseudohdr *bp; - struct scatterlist sg; - struct tcphdr *_th; - - bp = hp->scratch; - /* 1. TCP pseudo-header (RFC2460) */ - bp->saddr = *saddr; - bp->daddr = *daddr; - bp->protocol = cpu_to_be32(IPPROTO_TCP); - bp->len = cpu_to_be32(nbytes); - - _th = (struct tcphdr *)(bp + 1); - memcpy(_th, th, sizeof(*th)); - _th->check = 0; - - sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, - sizeof(*bp) + sizeof(*th)); - return crypto_ahash_update(hp->md5_req); + struct { + struct tcp6_pseudohdr ip; /* TCP pseudo-header (RFC2460) */ + struct tcphdr tcp; + } h; + + h.ip.saddr = *saddr; + h.ip.daddr = *daddr; + h.ip.protocol = cpu_to_be32(IPPROTO_TCP); + h.ip.len = cpu_to_be32(nbytes); + h.tcp = *th; + h.tcp.check = 0; + md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); } -static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - const struct in6_addr *daddr, struct in6_addr *saddr, - const struct tcphdr *th) +static noinline_for_stack void +tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, + const struct in6_addr *daddr, struct in6_addr *saddr, + const struct tcphdr *th) { - struct tcp_md5sig_pool *hp; - struct ahash_request *req; - - hp = tcp_get_md5sig_pool(); - if (!hp) - goto clear_hash_noput; - req = hp->md5_req; - - if (crypto_ahash_init(req)) - goto clear_hash; - if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(hp, key)) - goto clear_hash; - ahash_request_set_crypt(req, NULL, md5_hash, 0); - if (crypto_ahash_final(req)) - goto clear_hash; - - tcp_put_md5sig_pool(); - return 0; + struct md5_ctx ctx; -clear_hash: - tcp_put_md5sig_pool(); -clear_hash_noput: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } -static int tcp_v6_md5_hash_skb(char *md5_hash, - const struct tcp_md5sig_key *key, - const struct sock *sk, - const struct sk_buff *skb) +static noinline_for_stack void +tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { - const struct in6_addr *saddr, *daddr; - struct tcp_md5sig_pool *hp; - struct ahash_request *req; const struct tcphdr *th = tcp_hdr(skb); + const struct in6_addr *saddr, *daddr; + struct md5_ctx ctx; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; @@ -749,39 +739,18 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, daddr = &ip6h->daddr; } - hp = tcp_get_md5sig_pool(); - if (!hp) - goto clear_hash_noput; - req = hp->md5_req; - - if (crypto_ahash_init(req)) - goto clear_hash; - - if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len)) - goto clear_hash; - if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(hp, key)) - goto clear_hash; - ahash_request_set_crypt(req, NULL, md5_hash, 0); - if (crypto_ahash_final(req)) - goto clear_hash; - - tcp_put_md5sig_pool(); - return 0; - -clear_hash: - tcp_put_md5sig_pool(); -clear_hash_noput: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); + tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } - #endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, - struct sk_buff *skb) + struct sk_buff *skb, + u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); @@ -789,17 +758,19 @@ static void tcp_v6_init_req(struct request_sock *req, ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ireq->ir_rmt_addr = LOOPBACK4_IPV6; + ireq->ir_loc_addr = LOOPBACK4_IPV6; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - if (!TCP_SKB_CB(skb)->tcp_tw_isn && + if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || - np->rxopt.bits.rxohlim || np->repflow)) { + np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } @@ -808,9 +779,10 @@ static void tcp_v6_init_req(struct request_sock *req, static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { - tcp_v6_init_req(req, sk, skb); + tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; @@ -821,11 +793,9 @@ static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { @@ -835,6 +805,11 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif +#ifdef CONFIG_TCP_AO + .ao_lookup = tcp_v6_ao_lookup_rsk, + .ao_calc_key = tcp_v6_ao_calc_key_rsk, + .ao_synack_hash = tcp_v6_ao_synack_hash, +#endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif @@ -846,29 +821,29 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, - int oif, struct tcp_md5sig_key *key, int rst, - u8 tclass, __be32 label, u32 priority, u32 txhash) + int oif, int rst, u8 tclass, __be32 label, + u32 priority, u32 txhash, struct tcp_key *key) { - const struct tcphdr *th = tcp_hdr(skb); - struct tcphdr *t1; - struct sk_buff *buff; - struct flowi6 fl6; - struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); - struct sock *ctl_sk = net->ipv6.tcp_sk; + struct net *net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); unsigned int tot_len = sizeof(struct tcphdr); + struct sock *ctl_sk = net->ipv6.tcp_sk; + const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; - __u32 mark = 0; + struct sk_buff *buff; + struct tcphdr *t1; + struct flowi6 fl6; + u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; -#ifdef CONFIG_TCP_MD5SIG - if (key) + if (tcp_key_is_md5(key)) tot_len += TCPOLEN_MD5SIG_ALIGNED; -#endif + if (tcp_key_is_ao(key)) + tot_len += tcp_ao_len_aligned(key->ao_key); #ifdef CONFIG_MPTCP - if (rst && !key) { + if (rst && !tcp_key_is_md5(key)) { mrst = mptcp_reset_option(skb); if (mrst) @@ -909,14 +884,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 *topt++ = mrst; #ifdef CONFIG_TCP_MD5SIG - if (key) { + if (tcp_key_is_md5(key)) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); - tcp_v6_md5_hash_hdr((__u8 *)topt, key, + tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); } #endif +#ifdef CONFIG_TCP_AO + if (tcp_key_is_ao(key)) { + *topt++ = htonl((TCPOPT_AO << 24) | + (tcp_ao_len(key->ao_key) << 16) | + (key->ao_key->sndid << 8) | + (key->rcv_next)); + + tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key, + key->traffic_key, + (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr, + (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr, + t1, key->sne); + } +#endif memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; @@ -938,11 +927,15 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } if (sk) { + /* unconstify the socket only to attach it to buff with care. */ + skb_set_owner_edemux(buff, (struct sock *)sk); + psp_reply_set_decrypted(sk, buff); + if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else - mark = sk->sk_mark; - skb_set_delivery_time(buff, tcp_transmit_time(sk), true); + mark = READ_ONCE(sk->sk_mark); + skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ @@ -965,7 +958,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, - tclass & ~INET_ECN_MASK, priority); + tclass, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -975,23 +968,27 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 kfree_skb(buff); } -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); - u32 seq = 0, ack_seq = 0; - struct tcp_md5sig_key *key = NULL; -#ifdef CONFIG_TCP_MD5SIG - const __u8 *hash_location = NULL; - unsigned char newhash[16]; - int genhash; - struct sock *sk1 = NULL; + const __u8 *md5_hash_location = NULL; +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) + bool allocated_traffic_key = false; #endif + const struct tcp_ao_hdr *aoh; + struct tcp_key key = {}; + u32 seq = 0, ack_seq = 0; __be32 label = 0; u32 priority = 0; struct net *net; u32 txhash = 0; int oif = 0; +#ifdef CONFIG_TCP_MD5SIG + unsigned char newhash[16]; + struct sock *sk1 = NULL; +#endif if (th->rst) return; @@ -1002,10 +999,14 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); -#ifdef CONFIG_TCP_MD5SIG + net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); + /* Invalid TCP option size or twice included auth */ + if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) + return; +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); - hash_location = tcp_parse_md5sig_option(th); +#endif +#ifdef CONFIG_TCP_MD5SIG if (sk && sk_fullsock(sk)) { int l3index; @@ -1013,8 +1014,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; - key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); - } else if (hash_location) { + key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); + if (key.md5_key) + key.type = TCP_KEY_MD5; + } else if (md5_hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); int l3index; @@ -1026,8 +1029,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, &ipv6h->saddr, th->source, + sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) @@ -1038,12 +1040,13 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) */ l3index = tcp_v6_sdif(skb) ? dif : 0; - key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); - if (!key) + key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); + if (!key.md5_key) goto out; + key.type = TCP_KEY_MD5; - genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb); - if (genhash || memcmp(hash_location, newhash, 16) != 0) + tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); + if (memcmp(md5_hash_location, newhash, 16) != 0) goto out; } #endif @@ -1054,16 +1057,27 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); +#ifdef CONFIG_TCP_AO + if (aoh) { + int l3index; + + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq, + &key.ao_key, &key.traffic_key, + &allocated_traffic_key, + &key.rcv_next, &key.sne)) + goto out; + key.type = TCP_KEY_AO; + } +#endif + if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { - const struct ipv6_pinfo *np = tcp_inet6_sk(sk); - - trace_tcp_send_reset(sk, skb); - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); - priority = sk->sk_priority; - txhash = sk->sk_hash; + priority = READ_ONCE(sk->sk_priority); + txhash = sk->sk_txhash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); @@ -1075,63 +1089,166 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, - ipv6_get_dsfield(ipv6h), label, priority, txhash); + trace_tcp_send_reset(sk, skb, reason); -#ifdef CONFIG_TCP_MD5SIG + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, + ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK, + label, priority, txhash, + &key); + +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) out: + if (allocated_traffic_key) + kfree(key.traffic_key); rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, - struct tcp_md5sig_key *key, u8 tclass, + struct tcp_key *key, u8 tclass, __be32 label, u32 priority, u32 txhash) { - tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, - tclass, label, priority, txhash); + tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0, + tclass, label, priority, txhash, key); } -static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u8 tclass = tw->tw_tclass; + struct tcp_key key = {}; + + if (tw_status == TCP_TW_ACK_OOW) + tclass &= ~INET_ECN_MASK; +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao_info; + + if (static_branch_unlikely(&tcp_ao_needed.key)) { + + /* FIXME: the segment to-be-acked is not verified yet */ + ao_info = rcu_dereference(tcptw->ao_info); + if (ao_info) { + const struct tcp_ao_hdr *aoh; + + /* Invalid TCP option size or twice included auth */ + if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) + goto out; + if (aoh) + key.ao_key = tcp_ao_established_key(sk, ao_info, + aoh->rnext_keyid, -1); + } + } + if (key.ao_key) { + struct tcp_ao_key *rnext_key; + + key.traffic_key = snd_other_key(key.ao_key); + /* rcv_next switches to our rcv_next */ + rnext_key = READ_ONCE(ao_info->rnext_key); + key.rcv_next = rnext_key->rcvid; + key.sne = READ_ONCE(ao_info->snd_sne); + key.type = TCP_KEY_AO; +#else + if (0) { +#endif +#ifdef CONFIG_TCP_MD5SIG + } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + key.md5_key = tcp_twsk_md5_key(tcptw); + if (key.md5_key) + key.type = TCP_KEY_MD5; +#endif + } - tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, + READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp_raw() + tcptw->tw_ts_offset, - tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, - tw->tw_txhash); + tcp_tw_tsval(tcptw), + READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, + &key, tclass, cpu_to_be32(tw->tw_flowlabel), + tw->tw_priority, tw->tw_txhash); +#ifdef CONFIG_TCP_AO +out: +#endif inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - int l3index; + struct tcp_key key = {}; - l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; +#ifdef CONFIG_TCP_AO + if (static_branch_unlikely(&tcp_ao_needed.key) && + tcp_rsk_used_ao(req)) { + const struct in6_addr *addr = &ipv6_hdr(skb)->saddr; + const struct tcp_ao_hdr *aoh; + int l3index; + + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + /* Invalid TCP option size or twice included auth */ + if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) + return; + if (!aoh) + return; + key.ao_key = tcp_ao_do_lookup(sk, l3index, + (union tcp_ao_addr *)addr, + AF_INET6, aoh->rnext_keyid, -1); + if (unlikely(!key.ao_key)) { + /* Send ACK with any matching MKT for the peer */ + key.ao_key = tcp_ao_do_lookup(sk, l3index, + (union tcp_ao_addr *)addr, + AF_INET6, -1, -1); + /* Matching key disappeared (user removed the key?) + * let the handshake timeout. + */ + if (!key.ao_key) { + net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n", + addr, + ntohs(tcp_hdr(skb)->source), + &ipv6_hdr(skb)->daddr, + ntohs(tcp_hdr(skb)->dest)); + return; + } + } + key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); + if (!key.traffic_key) + return; + + key.type = TCP_KEY_AO; + key.rcv_next = aoh->keyid; + tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); +#else + if (0) { +#endif +#ifdef CONFIG_TCP_MD5SIG + } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + + key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, + l3index); + if (key.md5_key) + key.type = TCP_KEY_MD5; +#endif + } /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of <SYN> segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, + tcp_rsk_tsval(tcp_rsk(req)), req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), - ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority, - tcp_rsk(req)->txhash); + &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK, + 0, + READ_ONCE(sk->sk_priority), + READ_ONCE(tcp_rsk(req)->txhash)); + if (tcp_key_is_ao(&key)) + kfree(key.traffic_key); } @@ -1223,7 +1340,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (!newsk) return NULL; - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); @@ -1236,19 +1355,18 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; newnp->rcv_flowinfo = 0; - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; /* @@ -1269,17 +1387,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) - goto out_overflow; + goto exit_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) - goto out; + goto exit; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) - goto out_nonewsk; + goto exit_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -1288,30 +1406,28 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; + newinet->inet_opt = NULL; newtp = tcp_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; + ip6_dst_store(newsk, dst, false, false); + newnp->saddr = ireq->ir_v6_loc_addr; - newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; - newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ - newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; @@ -1321,7 +1437,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. @@ -1355,31 +1471,29 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_initialize_rcv_mss(newsk); - newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; - newinet->inet_rcv_saddr = LOOPBACK4_IPV6; - #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); - /* Copy over the MD5 key from the original socket */ - key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); - if (key) { - const union tcp_md5_addr *addr; + if (!tcp_rsk_used_ao(req)) { + /* Copy over the MD5 key from the original socket */ + key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); + if (key) { + const union tcp_md5_addr *addr; - addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; - if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { - inet_csk_prepare_forced_close(newsk); - tcp_done(newsk); - goto out; + addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; + if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) + goto put_and_exit; } } #endif +#ifdef CONFIG_TCP_AO + /* Copy over tcp_ao_info if any */ + if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) + goto put_and_exit; /* OOM */ +#endif - if (__inet_inherit_port(sk, newsk) < 0) { - inet_csk_prepare_forced_close(newsk); - tcp_done(newsk); - goto out; - } + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { @@ -1387,14 +1501,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_mask(sk, GFP_ATOMIC)); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) { + if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); - skb_set_owner_r(newnp->pktoptions, newsk); - } } } else { if (!req_unhash && found_dup_sk) { @@ -1409,13 +1520,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * return newsk; -out_overflow: +exit_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out_nonewsk: +exit_nonewsk: dst_release(dst); -out: +exit: tcp_listendrop(sk); return NULL; +put_and_exit: + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto exit; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, @@ -1447,6 +1562,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. @@ -1465,10 +1584,9 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) by tcp. Feel free to propose better solution. --ANK (980728) */ - if (np->rxopt.all) - opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + if (np->rxopt.all && sk->sk_state != TCP_LISTEN) + opt_skb = skb_clone_and_charge_r(skb, sk); - reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1498,36 +1616,36 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); - if (!nsk) - goto discard; - if (nsk != sk) { - if (tcp_child_process(sk, nsk, skb)) - goto reset; - if (opt_skb) - __kfree_skb(opt_skb); + if (nsk) { + reason = tcp_child_process(sk, nsk, skb); + if (reason) + goto reset; + } return 0; } } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb)) + reason = tcp_rcv_state_process(sk, skb); + if (reason) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1544,15 +1662,15 @@ ipv6_pktoptions: if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - np->mcast_oif = tcp_v6_iif(opt_skb); + WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + WRITE_ONCE(np->mcast_hops, + ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); - if (np->repflow) + if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { @@ -1581,8 +1699,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -1591,15 +1708,17 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; - struct net *net = dev_net(skb->dev); + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -1629,13 +1748,12 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), + sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -1645,11 +1763,14 @@ process: struct sock *nsk; sk = req->rsk_listener; - drop_reason = tcp_inbound_md5_hash(sk, skb, - &hdr->saddr, &hdr->daddr, - AF_INET6, dif, sdif); + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + drop_reason = SKB_DROP_REASON_XFRM_POLICY; + else + drop_reason = tcp_inbound_hash(sk, req, skb, + &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); if (drop_reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -1672,13 +1793,12 @@ process: } refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) { + if (!tcp_filter(sk, skb, &drop_reason)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); - } else { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } if (!nsk) { reqsk_put(req); @@ -1694,22 +1814,30 @@ process: } goto discard_and_relse; } + nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); - } else if (tcp_child_process(sk, nsk, skb)) { - tcp_v6_send_reset(nsk, skb); - goto discard_and_relse; } else { + drop_reason = tcp_child_process(sk, nsk, skb); + if (drop_reason) { + enum sk_rst_reason rst_reason; + + rst_reason = sk_rst_convert_drop_reason(drop_reason); + tcp_v6_send_reset(nsk, skb, rst_reason); + goto discard_and_relse; + } sock_put(sk); return 0; } } +process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ - if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { + if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } @@ -1719,15 +1847,16 @@ process: goto discard_and_relse; } - drop_reason = tcp_inbound_md5_hash(sk, skb, &hdr->saddr, &hdr->daddr, - AF_INET6, dif, sdif); + drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); if (drop_reason) goto discard_and_relse; - if (tcp_filter(sk, skb)) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + nf_reset_ct(skb); + + if (tcp_filter(sk, skb, &drop_reason)) goto discard_and_relse; - } + th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); @@ -1771,16 +1900,16 @@ csum_error: bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { - tcp_v6_send_reset(NULL, skb); + tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; @@ -1799,13 +1928,14 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2; - sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), @@ -1817,16 +1947,22 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v6_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v6_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: @@ -1837,7 +1973,7 @@ do_time_wait: void tcp_v6_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; @@ -1855,8 +1991,7 @@ void tcp_v6_early_demux(struct sk_buff *skb) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { @@ -1876,8 +2011,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, - .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) @@ -1893,19 +2026,24 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), - .net_frag_header_len = sizeof(struct frag_hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, +#endif +#ifdef CONFIG_TCP_AO + .ao_lookup = tcp_v6_ao_lookup, + .calc_ao_hash = tcp_v6_ao_hash_skb, + .ao_parse = tcp_v6_parse_ao, + .ao_calc_key_sk = tcp_v6_ao_calc_key_sk, +#endif }; #endif @@ -1922,17 +2060,30 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, +#endif +#ifdef CONFIG_TCP_AO + .ao_lookup = tcp_v6_ao_lookup, + .calc_ao_hash = tcp_v4_ao_hash_skb, + .ao_parse = tcp_v6_parse_ao, + .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, +#endif }; + +static void tcp6_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet6_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -1946,8 +2097,9 @@ static int tcp_v6_init_sock(struct sock *sk) icsk->icsk_af_ops = &ipv6_specific; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; + sk->sk_destruct = tcp6_destruct_sock; #endif return 0; @@ -1981,7 +2133,7 @@ static void get_openreq6(struct seq_file *seq, jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), - sock_i_uid(req->rsk_listener)), + sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1997,6 +2149,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + u8 icsk_pending; int rx_queue; int state; @@ -2005,17 +2158,18 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + timer_expires = tcp_timeout_expires(sp); + } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; - } else if (timer_pending(&sp->sk_timer)) { + timer_expires = tcp_timeout_expires(sp); + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; - timer_expires = sp->sk_timer.expires; + timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = jiffies; @@ -2044,9 +2198,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), - icsk->icsk_retransmits, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), - icsk->icsk_probes_out, + READ_ONCE(icsk->icsk_retransmits), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), + READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), @@ -2079,7 +2233,7 @@ static void get_timewait6_sock(struct seq_file *seq, src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - tw->tw_substate, 0, 0, + READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } @@ -2153,10 +2307,10 @@ struct proto tcpv6_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, - .sendpage = tcp_sendpage, + .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, @@ -2168,16 +2322,16 @@ struct proto tcpv6_prot = { .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .memory_allocated = &tcp_memory_allocated, + .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, - .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, @@ -2187,11 +2341,6 @@ struct proto tcpv6_prot = { }; EXPORT_SYMBOL_GPL(tcpv6_prot); -static const struct inet6_protocol tcpv6_protocol = { - .handler = tcp_v6_rcv, - .err_handler = tcp_v6_err, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, -}; static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, @@ -2204,8 +2353,14 @@ static struct inet_protosw tcpv6_protosw = { static int __net_init tcpv6_net_init(struct net *net) { - return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, - SOCK_RAW, IPPROTO_TCP, net); + int res; + + res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, + SOCK_RAW, IPPROTO_TCP, net); + if (!res) + net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; + + return res; } static void __net_exit tcpv6_net_exit(struct net *net) @@ -2213,22 +2368,21 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) -{ - tcp_twsk_purge(net_exit_list, AF_INET6); -} - static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, - .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) { int ret; - ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP); + net_hotdata.tcpv6_protocol = (struct inet6_protocol) { + .handler = tcp_v6_rcv, + .err_handler = tcp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, + }; + ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; @@ -2253,7 +2407,7 @@ out_tcpv6_pernet_subsys: out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: - inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); + inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); goto out; } @@ -2261,5 +2415,5 @@ void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); - inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); + inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); } diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 39db5a226855..effeba58630b 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -7,36 +7,153 @@ */ #include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> +#include <net/inet6_hashtables.h> #include <net/gro.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" +static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ +#if IS_ENABLED(CONFIG_IPV6) + const struct ipv6hdr *hdr; + struct sk_buff *p; + struct sock *sk; + struct net *net; + int iif, sdif; + + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) + return; + + p = tcp_gro_lookup(head, th); + if (p) { + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; + return; + } + + inet6_get_iif_sdif(skb, &iif, &sdif); + hdr = skb_gro_network_header(skb); + net = dev_net_rcu(skb->dev); + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + iif, sdif); + NAPI_GRO_CB(skb)->is_flist = !sk; + if (sk) + sock_gen_put(sk); +#endif /* IS_ENABLED(CONFIG_IPV6) */ +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - ip6_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + ip6_gro_compute_pseudo)) + goto flush; + + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; - return tcp_gro_receive(head, skb); + tcp6_check_fraglist_gro(head, skb, th); + + return tcp_gro_receive(head, skb, th); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + __skb_incr_checksum_unnecessary(skb); + + return 0; + } + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, &iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; - return tcp_gro_complete(skb); + tcp_gro_complete(skb); + return 0; +} + +static void __tcpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, + __be16 *oldport, __be16 newport) +{ + struct tcphdr *th = tcp_hdr(seg); + + if (!ipv6_addr_equal(oldip, newip)) { + inet_proto_csum_replace16(&th->check, seg, + oldip->s6_addr32, + newip->s6_addr32, + true); + *oldip = *newip; + } + + if (*oldport == newport) + return; + + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); + *oldport = newport; +} + +static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct tcphdr *th; + const struct ipv6hdr *iph; + struct sk_buff *seg; + struct tcphdr *th2; + struct ipv6hdr *iph2; + + seg = segs; + th = tcp_hdr(seg); + iph = ipv6_hdr(seg); + th2 = tcp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + th2 = tcp_hdr(seg); + iph2 = ipv6_hdr(seg); + + __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &th2->source, th->source); + __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &th2->dest, th->dest); + } + + return segs; +} + +static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + return __tcpv6_gso_segment_list_csum(skb); } static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, @@ -50,6 +167,15 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) { + struct tcphdr *th = tcp_hdr(skb); + + if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) + return __tcp6_gso_segment_list(skb, features); + + skb->ip_summed = CHECKSUM_NONE; + } + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); @@ -65,15 +191,15 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, return tcp_gso_segment(skb, features); } -static const struct net_offload tcpv6_offload = { - .callbacks = { - .gso_segment = tcp6_gso_segment, - .gro_receive = tcp6_gro_receive, - .gro_complete = tcp6_gro_complete, - }, -}; int __init tcpv6_offload_init(void) { - return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP); + net_hotdata.tcpv6_offload = (struct net_offload) { + .callbacks = { + .gso_segment = tcp6_gso_segment, + .gro_receive = tcp6_gro_receive, + .gro_complete = tcp6_gro_complete, + }, + }; + return inet6_add_offload(&net_hotdata.tcpv6_offload, IPPROTO_TCP); } diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 00e8d8b1c9a7..dc4ea9b11794 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -302,4 +302,5 @@ static void __exit tunnel6_fini(void) module_init(tunnel6_init); module_exit(tunnel6_fini); +MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9fb2f33ee3a7..794c13674e8a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -34,6 +34,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/indirect_call_wrapper.h> +#include <trace/events/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> @@ -45,11 +46,13 @@ #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/ip6_tunnel.h> +#include <net/udp_tunnel.h> #include <net/xfrm.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/busy_poll.h> #include <net/sock_reuseport.h> +#include <net/gro.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -64,21 +67,20 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } -static u32 udp6_ehashfn(const struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp6_ehashfn(const struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { - static u32 udp6_ehash_secret __read_mostly; - static u32 udp_ipv6_hash_secret __read_mostly; - u32 lhash, fhash; net_get_random_once(&udp6_ehash_secret, @@ -90,7 +92,7 @@ static u32 udp6_ehashfn(const struct net *net, fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, - udp_ipv6_hash_secret + net_hash_mix(net)); + udp6_ehash_secret + net_hash_mix(net)); } int udp_v6_get_port(struct sock *sk, unsigned short snum) @@ -110,11 +112,22 @@ void udp_v6_rehash(struct sock *sk) u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); + u16 new_hash4; + + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + } else { + new_hash4 = udp6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + } - udp_lib_rehash(sk, new_hash); + udp_lib_rehash(sk, new_hash, new_hash4); } -static int compute_score(struct sock *sk, struct net *net, +static int compute_score(struct sock *sk, const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, int dif, int sdif) @@ -159,26 +172,51 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, +/** + * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port) + * @net: Network namespace + * @saddr: Source address, network order + * @sport: Source port, network order + * @daddr: Destination address, network order + * @hnum: Destination port, host order + * @dif: Destination interface index + * @sdif: Destination bridge port index, if relevant + * @udptable: Set of UDP hash tables + * + * Simplified lookup to be used as fallback if no sockets are found due to a + * potential race between (receive) address change, and lookup happening before + * the rehash operation. This function ignores SO_REUSEPORT groups while scoring + * result sockets, because if we have one, we don't need the fallback at all. + * + * Called under rcu_read_lock(). + * + * Return: socket with highest matching score if any, NULL if none + */ +static struct sock *udp6_lib_lookup1(const struct net *net, + const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, - unsigned int hnum) + unsigned int hnum, int dif, int sdif, + const struct udp_table *udptable) { - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); + unsigned int slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot = &udptable->hash[slot]; + struct sock *sk, *result = NULL; + int score, badness = 0; + + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, + saddr, sport, daddr, hnum, dif, sdif); + if (score > badness) { + result = sk; + badness = score; + } } - return reuse_sk; + + return result; } /* called with rcu_read_lock() */ -static struct sock *udp6_lib_lookup2(struct net *net, +static struct sock *udp6_lib_lookup2(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, @@ -186,66 +224,143 @@ static struct sock *udp6_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); + badness = score; + + if (need_rescore) + continue; + + if (sk->sk_state == TCP_ESTABLISHED) { + result = sk; + continue; + } + + result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp6_ehashfn); + if (!result) { + result = sk; + continue; + } + /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) + if (!reuseport_has_conns(sk)) return result; - result = result ? : sk; - badness = score; + /* Reuseport logic returned an error, keep original score. */ + if (IS_ERR(result)) + continue; + + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measurable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; } -static inline struct sock *udp6_lookup_run_bpf(struct net *net, - struct udp_table *udptable, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - u16 hnum, const int dif) +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) { - struct sock *sk, *reuse_sk; - bool no_reuseport; + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; - if (udptable != net->ipv4.udp_table) - return NULL; /* only UDP is supported */ + hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport, - daddr, hnum, dif, &sk); - if (no_reuseport || IS_ERR_OR_NULL(sk)) - return sk; +begin: + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) + return sk; + } - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); - if (reuse_sk) - sk = reuse_sk; - return sk; + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + udp4_hash4(sk); + return; + } + + if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + return; + + hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); } +#endif /* CONFIG_BASE_SMALL */ /* rcu_read_lock() must be held */ -struct sock *__udp6_lib_lookup(struct net *net, +struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv6_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); + + if (udp_has_hash4(hslot2)) { + result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp6_lib_lookup4 return sk or NULL */ + return result; + } /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, @@ -255,9 +370,11 @@ struct sock *__udp6_lib_lookup(struct net *net, goto done; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - sk = udp6_lookup_run_bpf(net, udptable, skb, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + udptable == net->ipv4.udp_table) { + sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, dif, + udp6_ehashfn); if (sk) { result = sk; goto done; @@ -270,12 +387,18 @@ struct sock *__udp6_lib_lookup(struct net *net, /* Lookup wildcard sockets */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, hslot2, skb); + if (!IS_ERR_OR_NULL(result)) + goto done; + + /* Cover address change/lookup/rehash race: see __udp4_lib_lookup() */ + result = udp6_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, + udptable); + done: if (IS_ERR(result)) return NULL; @@ -297,19 +420,23 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), net->ipv4.udp_table, NULL); + &iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6) -struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, +struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { struct sock *sk; @@ -323,7 +450,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif -/* do not use the scratch area len for jumbogram: their length execeeds the +/* do not use the scratch area len for jumbogram: their length exceeds the * scratch area space; note that the IP6CB flags is still in the first * cacheline, so checking for jumbograms is cheap */ @@ -353,7 +480,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: @@ -398,7 +525,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -429,17 +556,18 @@ try_again: *addr_len = sizeof(*sin6); BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + (struct sockaddr *)sin6, + addr_len); } - if (udp_sk(sk)->gro_enabled) + if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); if (is_udp4) { - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); } else { @@ -460,7 +588,7 @@ csum_copy_err: SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); @@ -468,7 +596,7 @@ csum_copy_err: goto try_again; } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_inc(&udpv6_encap_needed_key); @@ -590,7 +718,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, NULL); - if (!sk || udp_sk(sk)->encap_type) { + if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udpv6_encap_needed_key)) { sk = __udp6_lib_err_encap(net, hdr, offset, uh, @@ -617,13 +745,14 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!ip6_sk_accept_pmtu(sk)) goto out; ip6_sk_update_pmtu(skb, sk, info); - if (np->pmtudisc != IPV6_PMTUDISC_DONT) + if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT) harderr = 1; } if (type == NDISC_REDIRECT) { if (tunnel) { ip6_redirect(skb, sock_net(sk), inet6_iif(skb), - sk->sk_mark, sk->sk_uid); + READ_ONCE(sk->sk_mark), + sk_uid(sk)); } else { ip6_sk_redirect(skb, sk); } @@ -638,7 +767,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - if (!np->recverr) { + if (!inet6_test_bit(RECVERR6, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { @@ -679,7 +808,8 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - kfree_skb_reason(skb, drop_reason); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -704,8 +834,10 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } + nf_reset_ct(skb); - if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { + if (static_branch_unlikely(&udpv6_encap_needed_key) && + READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* @@ -743,16 +875,17 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). */ - if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { + u16 pcrlen = READ_ONCE(up->pcrlen); - if (up->pcrlen == 0) { /* full coverage was set */ + if (pcrlen == 0) { /* full coverage was set */ net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n", UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } - if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { + if (UDP_SKB_CB(skb)->cscov < pcrlen) { net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, up->pcrlen); + UDP_SKB_CB(skb)->cscov, pcrlen); goto drop; } } @@ -762,10 +895,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop; - } udp_csum_pull_header(skb); @@ -778,8 +909,8 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, drop_reason); + udp_drops_inc(sk); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -805,12 +936,12 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, +static bool __udp_v6_is_mcast_sock(struct net *net, const struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, int dif, int sdif, unsigned short hnum) { - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) return false; @@ -863,7 +994,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -875,7 +1006,7 @@ start_lookup: /* If zero checksum and no_check is not on for * the socket then skip it. */ - if (!uh->check && !udp_sk(sk)->no_check6_rx) + if (!uh->check && !udp_get_no_check6_rx(sk)) continue; if (!first) { first = sk; @@ -883,7 +1014,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, @@ -914,14 +1045,11 @@ start_lookup: static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { - if (udp_sk_rx_dst_set(sk, dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); - } + if (udp_sk_rx_dst_set(sk, dst)) + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } -/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and +/* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, @@ -946,8 +1074,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); + struct sock *sk = NULL; struct udphdr *uh; - struct sock *sk; bool refcounted; u32 ulen = 0; @@ -985,7 +1113,11 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; /* Check if the socket is already available, e.g. due to early demux */ - sk = skb_steal_sock(skb, &refcounted); + sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, + &refcounted, udp6_ehashfn); + if (IS_ERR(sk)) + goto no_sk; + if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -993,7 +1125,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp6_sk_rx_dst_set(sk, dst); - if (!uh->check && !udp_sk(sk)->no_check6_rx) { + if (!uh->check && !udp_get_no_check6_rx(sk)) { if (refcounted) sock_put(sk); goto report_csum_error; @@ -1015,11 +1147,11 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, /* Unicast */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { - if (!uh->check && !udp_sk(sk)->no_check6_rx) + if (!uh->check && !udp_get_no_check6_rx(sk)) goto report_csum_error; return udp6_unicast_rcv_skb(sk, skb, uh); } - +no_sk: reason = SKB_DROP_REASON_NO_SOCKET; if (!uh->check) @@ -1027,6 +1159,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; + nf_reset_ct(skb); if (udp_lib_checksum_complete(skb)) goto csum_error; @@ -1034,7 +1167,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; short_packet: @@ -1055,7 +1188,7 @@ csum_error: __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; } @@ -1067,14 +1200,13 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -1110,11 +1242,12 @@ void udp_v6_early_demux(struct sk_buff *skb) else return; - if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) + if (!sk) return; skb->sk = sk; - skb->destructor = sock_efree; + DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); + skb->destructor = sock_pfree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) @@ -1144,12 +1277,12 @@ static void udp_v6_flush_pending_frames(struct sock *sk) udp_flush_pending_frames(sk); else if (up->pending) { up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); ip6_flush_pending_frames(sk); } } -static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int udpv6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { if (addr_len < offsetofend(struct sockaddr, sa_family)) @@ -1167,7 +1300,20 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; - return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); +} + +static int udpv6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip6_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp6_hash4(sk); + release_sock(sk); + return res; } /** @@ -1245,20 +1391,19 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } - if (udp_sk(sk)->no_check6_tx) { + if (udp_get_no_check6_tx(sk)) { kfree_skb(skb); return -EINVAL; } - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) { + if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } @@ -1268,13 +1413,15 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); + + /* Don't checksum the payload, skb will get segmented */ + goto csum_partial; } - goto csum_partial; } if (is_udplite) csum = udplite_csum(skb); - else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */ + else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ @@ -1293,7 +1440,7 @@ csum_partial: send: err = ip6_send_skb(skb); if (err) { - if (err == -ENOBUFS && !inet6_sk(sk)->recverr) { + if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; @@ -1322,7 +1469,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) &inet_sk(sk)->cork.base); out: up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); return err; } @@ -1344,15 +1491,13 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; - int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; + int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; /* destination address check */ if (sin6) { @@ -1379,7 +1524,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) default: return -EINVAL; } - } else if (!up->pending) { + } else if (!READ_ONCE(up->pending)) { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; @@ -1395,9 +1540,11 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) msg->msg_name = &sin; msg->msg_namelen = sizeof(sin); do_udp_sendmsg: - if (ipv6_only_sock(sk)) - return -ENETUNREACH; - return udp_sendmsg(sk, msg, len); + err = ipv6_only_sock(sk) ? + -ENETUNREACH : udp_sendmsg(sk, msg, len); + msg->msg_name = sin6; + msg->msg_namelen = addr_len; + return err; } } @@ -1408,8 +1555,8 @@ do_udp_sendmsg: return -EMSGSIZE; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; - if (up->pending) { - if (up->pending == AF_INET) + if (READ_ONCE(up->pending)) { + if (READ_ONCE(up->pending) == AF_INET) return udp_sendmsg(sk, msg, len); /* * There are pending frames. @@ -1437,7 +1584,7 @@ do_udp_sendmsg: fl6->fl6_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; - if (np->sndflow) { + if (inet6_test_bit(SNDFLOW, sk)) { fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); @@ -1474,7 +1621,7 @@ do_udp_sendmsg: if (!fl6->flowi6_oif) fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); if (msg->msg_controllen) { opt = &opt_space; @@ -1483,9 +1630,11 @@ do_udp_sendmsg: ipc6.opt = opt; err = udp_cmsg_send(sk, msg, &ipc6.gso_size); - if (err > 0) + if (err > 0) { err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6, &ipc6); + connected = false; + } if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1497,7 +1646,6 @@ do_udp_sendmsg: } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; - connected = false; } if (!opt) { opt = txopt_get(np); @@ -1518,6 +1666,7 @@ do_udp_sendmsg: if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, + &addr_len, &fl6->saddr); if (err) goto out_no_dst; @@ -1547,16 +1696,13 @@ do_udp_sendmsg: connected = false; if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) { - fl6->flowi6_oif = np->mcast_oif; + fl6->flowi6_oif = READ_ONCE(np->mcast_oif); connected = false; } else if (!fl6->flowi6_oif) - fl6->flowi6_oif = np->ucast_oif; + fl6->flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); @@ -1579,7 +1725,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, - (struct rt6_info *)dst, + dst_rt6_info(dst), msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) @@ -1599,24 +1745,22 @@ back_from_confirm: goto out; } - up->pending = AF_INET6; + WRITE_ONCE(up->pending, AF_INET6); do_append_data: - if (ipc6.dontfrag < 0) - ipc6.dontfrag = np->dontfrag; up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), - &ipc6, fl6, (struct rt6_info *)dst, + &ipc6, fl6, dst_rt6_info(dst), corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) err = udp_v6_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) - up->pending = 0; + WRITE_ONCE(up->pending, 0); if (err > 0) - err = np->recverr ? net_xmit_errno(err) : 0; + err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0; release_sock(sk); out: @@ -1649,6 +1793,20 @@ do_confirm: } EXPORT_SYMBOL(udpv6_sendmsg); +static void udpv6_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + + if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) + return; + + lock_sock(sk); + if (up->pending && !udp_test_bit(CORK, sk)) + udp_v6_push_pending_frames(sk); + release_sock(sk); +} + void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); @@ -1666,9 +1824,10 @@ void udpv6_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (up->encap_enabled) { + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } @@ -1694,11 +1853,6 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, return ipv6_getsockopt(sk, level, optname, optval, optlen); } -static const struct inet6_protocol udpv6_protocol = { - .handler = udpv6_rcv, - .err_handler = udpv6_err, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, -}; /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS @@ -1708,7 +1862,7 @@ int udp6_seq_show(struct seq_file *seq, void *v) seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { int bucket = ((struct udp_iter_state *)seq->private)->bucket; - struct inet_sock *inet = inet_sk(v); + const struct inet_sock *inet = inet_sk((const struct sock *)v); __u16 srcp = ntohs(inet->inet_sport); __u16 destp = ntohs(inet->inet_dport); __ip6_dgram_sock_seq_show(seq, v, srcp, destp, @@ -1751,7 +1905,7 @@ struct proto udpv6_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udpv6_pre_connect, - .connect = ip6_datagram_connect, + .connect = udpv6_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udpv6_init_sock, @@ -1760,6 +1914,7 @@ struct proto udpv6_prot = { .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, + .splice_eof = udpv6_splice_eof, .release_cb = ip6_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, @@ -1770,13 +1925,14 @@ struct proto udpv6_prot = { .psock_update_sk_prot = udp_bpf_update_proto, #endif - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = NULL, .diag_destroy = udp_abort, }; @@ -1793,7 +1949,12 @@ int __init udpv6_init(void) { int ret; - ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP); + net_hotdata.udpv6_protocol = (struct inet6_protocol) { + .handler = udpv6_rcv, + .err_handler = udpv6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, + }; + ret = inet6_add_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); if (ret) goto out; @@ -1804,12 +1965,12 @@ out: return ret; out_udpv6_protocol: - inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); + inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); goto out; } void udpv6_exit(void) { inet6_unregister_protosw(&udpv6_protosw); - inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); + inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); } diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 0590f566379d..8a406be25a3a 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _UDP6_IMPL_H #define _UDP6_IMPL_H +#include <net/aligned_data.h> #include <net/udp.h> #include <net/udplite.h> #include <net/protocol.h> diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index c39c1e32f980..046f13b1d77a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -14,6 +14,7 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" #include <net/gro.h> +#include <net/gso.h> static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) @@ -42,8 +43,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && - !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) return __udp_gso_segment(skb, features, true); mss = skb_shinfo(skb)->gso_size; @@ -117,11 +117,19 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; + int iif, sdif; + + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), net->ipv4.udp_table, NULL); + &iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE @@ -146,8 +154,6 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 1; - if (static_branch_unlikely(&udpv6_encap_needed_key)) sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); @@ -161,7 +167,8 @@ flush: INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) { - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); /* do fraglist only if there is no outer UDP encap (or we already processed it) */ @@ -171,13 +178,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } @@ -189,20 +190,19 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb); } -static const struct net_offload udpv6_offload = { - .callbacks = { - .gso_segment = udp6_ufo_fragment, - .gro_receive = udp6_gro_receive, - .gro_complete = udp6_gro_complete, - }, -}; - -int udpv6_offload_init(void) +int __init udpv6_offload_init(void) { - return inet6_add_offload(&udpv6_offload, IPPROTO_UDP); + net_hotdata.udpv6_offload = (struct net_offload) { + .callbacks = { + .gso_segment = udp6_ufo_fragment, + .gro_receive = udp6_gro_receive, + .gro_complete = udp6_gro_complete, + }, + }; + return inet6_add_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP); } int udpv6_offload_exit(void) { - return inet6_del_offload(&udpv6_offload, IPPROTO_UDP); + return inet6_del_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP); } diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 67eaf3ca14ce..2cec542437f7 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -8,6 +8,8 @@ * Changes: * Fixes: */ +#define pr_fmt(fmt) "UDPLite6: " fmt + #include <linux/export.h> #include <linux/proc_fs.h> #include "udp_impl.h" @@ -15,7 +17,8 @@ static int udplitev6_sk_init(struct sock *sk) { udpv6_init_sock(sk); - udp_sk(sk)->pcflag = UDPLITE_BIT; + pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " + "please contact the netdev mailing list\n"); return 0; } @@ -56,11 +59,14 @@ struct proto udplitev6_prot = { .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = &udplite_table, }; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 04cbeefd8982..9005fc156a20 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -16,6 +16,8 @@ #include <linux/netfilter_ipv6.h> #include <net/ipv6.h> #include <net/xfrm.h> +#include <net/protocol.h> +#include <net/gro.h> int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t) @@ -41,7 +43,7 @@ static int xfrm6_transport_finish2(struct net *net, struct sock *sk, int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); - int nhlen = skb->data - skb_network_header(skb); + int nhlen = -skb_network_offset(skb); skb_network_header(skb)[IP6CB(skb)->nhoff] = XFRM_MODE_SKB_CB(skb)->protocol; @@ -56,7 +58,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) skb_postpush_rcsum(skb, skb_network_header(skb), nhlen); if (xo && (xo->flags & XFRM_GRO)) { - skb_mac_header_rebuild(skb); + /* The full l2 header needs to be preserved so that re-injecting the packet at l2 + * works correctly in the presence of vlan tags. + */ + skb_mac_header_rebuild_full(skb, xo->orig_mac_len); + skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } @@ -67,25 +73,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return 0; } -/* If it's a keepalive packet, then just eat it. - * If it's an encapsulated packet, then pass it to the - * IPsec xfrm input. - * Returns 0 if skb passed to xfrm or was dropped. - * Returns >0 if skb should be passed to UDP. - * Returns <0 if skb should be resubmitted (-ret is protocol) - */ -int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) +static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull) { struct udp_sock *up = udp_sk(sk); struct udphdr *uh; struct ipv6hdr *ip6h; int len; int ip6hlen = sizeof(struct ipv6hdr); - __u8 *udpdata; __be32 *udpdata32; - __u16 encap_type = up->encap_type; + u16 encap_type; + encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; @@ -106,7 +105,7 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) case UDP_ENCAP_ESPINUDP: /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { - goto drop; + return -EINVAL; } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); @@ -114,19 +113,6 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) /* Must be an IKE packet.. pass it through */ return 1; break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - /* Check if this is a keepalive packet. If so, eat it. */ - if (len == 1 && udpdata[0] == 0xff) { - goto drop; - } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && - udpdata32[0] == 0 && udpdata32[1] == 0) { - - /* ESP Packet with Non-IKE marker */ - len = sizeof(struct udphdr) + 2 * sizeof(u32); - } else - /* Must be an IKE packet.. pass it through */ - return 1; - break; } /* At this point we are sure that this is an ESPinUDP packet, @@ -135,31 +121,105 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) * protocol to ESP, and then call into the transform receiver. */ if (skb_unclone(skb, GFP_ATOMIC)) - goto drop; + return -EINVAL; /* Now we can update and verify the packet length... */ ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len); if (skb->len < ip6hlen + len) { /* packet is too small!?! */ - goto drop; + return -EINVAL; } /* pull the data buffer up to the ESP header and set the * transport header to point to ESP. Keep UDP on the stack * for later. */ - __skb_pull(skb, len); - skb_reset_transport_header(skb); + if (pull) { + __skb_pull(skb, len); + skb_reset_transport_header(skb); + } else { + skb_set_transport_header(skb, len); + } /* process ESP */ - return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, encap_type); - -drop: - kfree_skb(skb); return 0; } +/* If it's a keepalive packet, then just eat it. + * If it's an encapsulated packet, then pass it to the + * IPsec xfrm input. + * Returns 0 if skb passed to xfrm or was dropped. + * Returns >0 if skb should be passed to UDP. + * Returns <0 if skb should be resubmitted (-ret is protocol) + */ +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) +{ + int ret; + + if (skb->protocol == htons(ETH_P_IP)) + return xfrm4_udp_encap_rcv(sk, skb); + + ret = __xfrm6_udp_encap_rcv(sk, skb, true); + if (!ret) + return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, + udp_sk(sk)->encap_type); + + if (ret < 0) { + kfree_skb(skb); + return 0; + } + + return ret; +} + +struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, + struct sk_buff *skb) +{ + int offset = skb_gro_offset(skb); + const struct net_offload *ops; + struct sk_buff *pp = NULL; + int len, dlen; + __u8 *udpdata; + __be32 *udpdata32; + + if (skb->protocol == htons(ETH_P_IP)) + return xfrm4_gro_udp_encap_rcv(sk, head, skb); + + len = skb->len - offset; + dlen = offset + min(len, 8); + udpdata = skb_gro_header(skb, dlen, offset); + udpdata32 = (__be32 *)udpdata; + if (unlikely(!udpdata)) + return NULL; + + rcu_read_lock(); + ops = rcu_dereference(inet6_offloads[IPPROTO_ESP]); + if (!ops || !ops->callbacks.gro_receive) + goto out; + + /* check if it is a keepalive or IKE packet */ + if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) + goto out; + + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; + + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); + rcu_read_unlock(); + + return pp; + +out: + rcu_read_unlock(); + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 1; + + return NULL; +} + int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], @@ -215,6 +275,13 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, if (!x) continue; + if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); + xfrm_state_put(x); + x = NULL; + continue; + } + spin_lock(&x->lock); if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index ad07904642ca..512bdaf13699 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -82,20 +82,20 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb->sk)) { + if (toobig && xfrm6_local_dontfrag(sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; } else if (toobig && xfrm6_noneed_fragment(skb)) { skb->ignore_df = 1; goto skip_frag; - } else if (!skb->ignore_df && toobig && skb->sk) { + } else if (!skb->ignore_df && toobig && sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); return -EMSGSIZE; } - if (toobig || dst_allfrag(skb_dst(skb))) + if (toobig) return ip6_fragment(net, sk, skb, __xfrm6_output_finish); @@ -106,7 +106,7 @@ skip_frag: int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, skb->dev, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst_dev(skb), __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ea435eba3053..1f19b6f14484 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -23,23 +23,24 @@ #include <net/ip6_route.h> #include <net/l3mdev.h> -static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark) +static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(net, oif); - fl6.flowi6_mark = mark; - memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); - if (saddr) - memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); + fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, + params->oif); + fl6.flowi6_mark = params->mark; + memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); + if (params->saddr) + memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); - dst = ip6_route_output(net, NULL, &fl6); + fl6.flowi4_proto = params->ipproto; + fl6.uli = params->uli; + + dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { @@ -50,19 +51,25 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, return dst; } -static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr, - u32 mark) +static int xfrm6_get_saddr(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; + struct inet6_dev *idev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); + dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; - dev = ip6_dst_idev(dst)->dev; - ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); + idev = ip6_dst_idev(dst); + if (!idev) { + dst_release(dst); + return -EHOSTUNREACH; + } + dev = idev->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, ¶ms->daddr->in6, 0, + &saddr->in6); dst_release(dst); return 0; } @@ -70,7 +77,7 @@ static int xfrm6_get_saddr(struct net *net, int oif, static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rt6_info *rt = (struct rt6_info *)xdst->route; + struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); @@ -89,7 +96,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; - INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached); rt6_uncached_list_add(&xdst->u.rt6); return 0; @@ -118,22 +124,17 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + dst_destroy_metrics_generic(dst); + rt6_uncached_list_del(&xdst->u.rt6); if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); - dst_destroy_metrics_generic(dst); - if (xdst->u.rt6.rt6i_uncached_list) - rt6_uncached_list_del(&xdst->u.rt6); xfrm_dst_destroy(xdst); } -static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) +static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct xfrm_dst *xdst; - if (!unregister) - return; - xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = @@ -190,7 +191,6 @@ static struct ctl_table xfrm6_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int __net_init xfrm6_net_sysctl_init(struct net *net) @@ -207,7 +207,8 @@ static int __net_init xfrm6_net_sysctl_init(struct net *net) table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; } - hdr = register_net_sysctl(net, "net/ipv6", table); + hdr = register_net_sysctl_sz(net, "net/ipv6", table, + ARRAY_SIZE(xfrm6_policy_table)); if (!hdr) goto err_reg; @@ -223,7 +224,7 @@ err_alloc: static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; @@ -290,8 +291,14 @@ int __init xfrm6_init(void) ret = register_pernet_subsys(&xfrm6_net_ops); if (ret) goto out_protocol; + + ret = xfrm_nat_keepalive_init(AF_INET6); + if (ret) + goto out_nat_keepalive; out: return ret; +out_nat_keepalive: + unregister_pernet_subsys(&xfrm6_net_ops); out_protocol: xfrm6_protocol_fini(); out_state: @@ -303,6 +310,7 @@ out_policy: void xfrm6_fini(void) { + xfrm_nat_keepalive_fini(AF_INET6); unregister_pernet_subsys(&xfrm6_net_ops); xfrm6_protocol_fini(); xfrm6_policy_fini(); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 1323f2f6928e..0a0eeaed0591 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,8 +334,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; + xfrm_state_flush(net, 0, false); xfrm_flush_gc(); - xfrm_state_flush(net, 0, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); @@ -355,10 +355,7 @@ static int __init xfrm6_tunnel_init(void) { int rv; - xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", - sizeof(struct xfrm6_tunnel_spi), - 0, SLAB_HWCACHE_ALIGN, - NULL); + xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN); if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); @@ -401,5 +398,6 @@ static void __exit xfrm6_tunnel_fini(void) module_init(xfrm6_tunnel_init); module_exit(xfrm6_tunnel_fini); +MODULE_DESCRIPTION("IPv6 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6); |
