diff options
Diffstat (limited to 'net/ipv6')
63 files changed, 2728 insertions, 1767 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 08d4b7132d4c..1c9c686d9522 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -323,6 +323,7 @@ config IPV6_RPL_LWTUNNEL bool "IPv6: RPL Source Routing Header support" depends on IPV6 select LWTUNNEL + select DST_CACHE help Support for RFC6554 RPL Source Routing Header using the lightweight tunnels mechanism. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 779aa6ecdd49..43b19adfbf88 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -63,6 +63,7 @@ #include <linux/string.h> #include <linux/hash.h> +#include <net/ip_tunnels.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> @@ -79,20 +80,19 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/l3mdev.h> +#include <net/netdev_lock.h> #include <linux/if_tunnel.h> #include <linux/rtnetlink.h> #include <linux/netconf.h> #include <linux/random.h> #include <linux/uaccess.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #include <linux/ioam6.h> -#define INFINITY_LIFE_TIME 0xFFFFFFFF - #define IPV6_MAX_STRLEN \ sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") @@ -238,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, + .ra_honor_pio_pflag = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -301,6 +302,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, + .ra_honor_pio_pflag = 0, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -311,7 +313,7 @@ static inline bool addrconf_link_ready(const struct net_device *dev) static void addrconf_del_rs_timer(struct inet6_dev *idev) { - if (del_timer(&idev->rs_timer)) + if (timer_delete(&idev->rs_timer)) __in6_dev_put(idev); } @@ -376,6 +378,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) int err = -ENOMEM; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev) return ERR_PTR(-EINVAL); @@ -401,7 +404,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) return ERR_PTR(err); } if (ndev->cnf.forwarding) - dev_disable_lro(dev); + netif_disable_lro(dev); /* We refer to the device */ netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL); @@ -851,7 +854,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -862,15 +865,14 @@ static void addrconf_forward_change(struct net *net, __s32 newf) } } -static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) +static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; WRITE_ONCE(*p, newf); @@ -880,7 +882,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -902,7 +904,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) net->ipv6.devconf_all); } else if ((!newf) ^ (!old)) dev_forward_change((struct inet6_dev *)table->extra1); - rtnl_unlock(); + rtnl_net_unlock(net); if (newf) rt6_purge_dflt_routers(net); @@ -915,7 +917,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); @@ -930,15 +932,14 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) } } -static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; WRITE_ONCE(*p, newf); @@ -949,7 +950,7 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -963,7 +964,8 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); } - rtnl_unlock(); + + rtnl_net_unlock(net); return 1; } @@ -1015,7 +1017,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr) { - u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } @@ -1872,7 +1874,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, master, &dst, scores, hiscore_idx); - if (scores[hiscore_idx].ifa) + if (scores[hiscore_idx].ifa && + scores[hiscore_idx].scopedist >= 0) goto out; } @@ -2568,6 +2571,24 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) return idev; } +static void delete_tempaddrs(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ift, *tmp; + + write_lock_bh(&idev->lock); + list_for_each_entry_safe(ift, tmp, &idev->tempaddr_list, tmp_list) { + if (ift->ifpub != ifp) + continue; + + in6_ifa_hold(ift); + write_unlock_bh(&idev->lock); + ipv6_del_addr(ift); + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + static void manage_tempaddrs(struct inet6_dev *idev, struct inet6_ifaddr *ifp, __u32 valid_lft, __u32 prefered_lft, @@ -2760,6 +2781,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); + bool ignore_autoconf = false; pinfo = (struct prefix_info *) opt; @@ -2862,7 +2884,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) /* Try to figure out our local address for this prefix */ - if (pinfo->autoconf && in6_dev->cnf.autoconf) { + ignore_autoconf = READ_ONCE(in6_dev->cnf.ra_honor_pio_pflag) && pinfo->preferpd; + if (pinfo->autoconf && in6_dev->cnf.autoconf && !ignore_autoconf) { struct in6_addr addr; bool tokenized = false, dev_addr_generated = false; @@ -2918,7 +2941,7 @@ put: static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, struct in6_ifreq *ireq) { - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) @@ -2958,11 +2981,11 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); if (dev && dev->type == ARPHRD_SIT) err = addrconf_set_sit_dstaddr(net, dev, &ireq); - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -2986,39 +3009,25 @@ static int ipv6_mc_config(struct sock *sk, bool join, /* * Manual configuration of address on an interface */ -static int inet6_addr_add(struct net *net, int ifindex, - struct ifa6_config *cfg, +static int inet6_addr_add(struct net *net, struct net_device *dev, + struct ifa6_config *cfg, clock_t expires, u32 flags, struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; - struct net_device *dev; - unsigned long timeout; - clock_t expires; - u32 flags; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); if (cfg->plen > 128) { NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; } - /* check the lifetime */ - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) { - NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); - return -EINVAL; - } - if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) { NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64"); return -EINVAL; } - dev = __dev_get_by_index(net, ifindex); - if (!dev) - return -ENODEV; - idev = addrconf_add_dev(dev); if (IS_ERR(idev)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); @@ -3027,7 +3036,7 @@ static int inet6_addr_add(struct net *net, int ifindex, if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, - true, cfg->pfx, ifindex); + true, cfg->pfx, dev->ifindex); if (ret < 0) { NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed"); @@ -3037,24 +3046,6 @@ static int inet6_addr_add(struct net *net, int ifindex, cfg->scope = ipv6_addr_scope(cfg->pfx); - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - ifp = ipv6_add_addr(idev, cfg, true, extack); if (!IS_ERR(ifp)) { if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -3082,7 +3073,7 @@ static int inet6_addr_add(struct net *net, int ifindex, return 0; } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, - cfg->pfx, ifindex); + cfg->pfx, dev->ifindex); } return PTR_ERR(ifp); @@ -3107,7 +3098,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, return -ENODEV; } - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (!idev) { NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return -ENXIO; @@ -3120,11 +3111,12 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); - if (!(ifp->flags & IFA_F_TEMPORARY) && - (ifa_flags & IFA_F_MANAGETEMPADDR)) - manage_tempaddrs(idev, ifp, 0, 0, false, - jiffies); ipv6_del_addr(ifp); + + if (!(ifp->flags & IFA_F_TEMPORARY) && + (ifp->flags & IFA_F_MANAGETEMPADDR)) + delete_tempaddrs(idev, ifp); + addrconf_verify_rtnl(net); if (ipv6_addr_is_multicast(pfx)) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, @@ -3147,6 +3139,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) .preferred_lft = INFINITY_LIFE_TIME, .valid_lft = INFINITY_LIFE_TIME, }; + struct net_device *dev; struct in6_ifreq ireq; int err; @@ -3159,9 +3152,16 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) cfg.pfx = &ireq.ifr6_addr; cfg.plen = ireq.ifr6_prefixlen; - rtnl_lock(); - err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL); - rtnl_unlock(); + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); + if (dev) { + netdev_lock_ops(dev); + err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL); + netdev_unlock_ops(dev); + } else { + err = -ENODEV; + } + rtnl_net_unlock(net); return err; } @@ -3176,10 +3176,10 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, ireq.ifr6_prefixlen, NULL); - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -3214,16 +3214,13 @@ static void add_v4_addrs(struct inet6_dev *idev) struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen, offset = 0; + int scope, plen; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ - if (idev->dev->addr_len == sizeof(struct in6_addr)) - offset = sizeof(struct in6_addr) - 4; - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { scope = IPV6_ADDR_COMPATv4; @@ -3534,7 +3531,13 @@ static void addrconf_gre_config(struct net_device *dev) return; } - if (dev->type == ARPHRD_ETHER) { + /* Generate the IPv6 link-local address using addrconf_addr_gen(), + * unless we have an IPv4 GRE device not bound to an IP address and + * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this + * case). Such devices fall back to add_v4_addrs() instead. + */ + if (!(dev->type == ARPHRD_IPGRE && *(__be32 *)dev->dev_addr == 0 && + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { addrconf_addr_gen(idev, true); return; } @@ -4182,6 +4185,7 @@ static void addrconf_dad_work(struct work_struct *w) struct inet6_dev *idev = ifp->idev; bool bump_id, disable_ipv6 = false; struct in6_addr mcaddr; + struct net *net; enum { DAD_PROCESS, @@ -4189,7 +4193,9 @@ static void addrconf_dad_work(struct work_struct *w) DAD_ABORT, } action = DAD_PROCESS; - rtnl_lock(); + net = dev_net(idev->dev); + + rtnl_net_lock(net); spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { @@ -4199,7 +4205,7 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->accept_dad) > 1 || + if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 || READ_ONCE(idev->cnf.accept_dad) > 1) && !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { @@ -4281,7 +4287,7 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_nonce); out: in6_ifa_put(ifp); - rtnl_unlock(); + rtnl_net_unlock(net); } /* ifp->idev must be at least read locked */ @@ -4729,9 +4735,9 @@ static void addrconf_verify_work(struct work_struct *w) struct net *net = container_of(to_delayed_work(w), struct net, ipv6.addr_chk_work); - rtnl_lock(); + rtnl_net_lock(net); addrconf_verify_rtnl(net); - rtnl_unlock(); + rtnl_net_unlock(net); } static void addrconf_verify(struct net *net) @@ -4789,16 +4795,20 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!pfx) return -EINVAL; - ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); /* We ignore other flags so far. */ ifa_flags &= IFA_F_MANAGETEMPADDR; - return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, - ifm->ifa_prefixlen, extack); + rtnl_net_lock(net); + err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, + ifm->ifa_prefixlen, extack); + rtnl_net_unlock(net); + + return err; } -static int modify_prefix_route(struct inet6_ifaddr *ifp, +static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, unsigned long expires, u32 flags, bool modify_peer) { @@ -4822,7 +4832,9 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ifp->prefix_len, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); - } else { + return 0; + } + if (f6i != net->ipv6.fib6_null_entry) { table = f6i->fib6_table; spin_lock_bh(&table->tb6_lock); @@ -4835,27 +4847,21 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, } spin_unlock_bh(&table->tb6_lock); - - fib6_info_release(f6i); } + fib6_info_release(f6i); return 0; } static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, - struct ifa6_config *cfg) + struct ifa6_config *cfg, clock_t expires, + u32 flags) { - u32 flags; - clock_t expires; - unsigned long timeout; bool was_managetempaddr; - bool had_prefixroute; bool new_peer = false; + bool had_prefixroute; - ASSERT_RTNL(); - - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) - return -EINVAL; + ASSERT_RTNL_NET(net); if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) @@ -4864,24 +4870,6 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) cfg->ifa_flags &= ~IFA_F_OPTIMISTIC; - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - if (cfg->peer_pfx && memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) { if (!ipv6_addr_any(&ifp->peer_addr)) @@ -4916,7 +4904,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, int rc = -ENOENT; if (had_prefixroute) - rc = modify_prefix_route(ifp, expires, flags, false); + rc = modify_prefix_route(net, ifp, expires, flags, false); /* prefix route could have been deleted; if so restore it */ if (rc == -ENOENT) { @@ -4926,7 +4914,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, } if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr)) - rc = modify_prefix_route(ifp, expires, flags, true); + rc = modify_prefix_route(net, ifp, expires, flags, true); if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) { addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len, @@ -4948,14 +4936,12 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, } if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { - if (was_managetempaddr && - !(ifp->flags & IFA_F_MANAGETEMPADDR)) { - cfg->valid_lft = 0; - cfg->preferred_lft = 0; - } - manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft, - cfg->preferred_lft, !was_managetempaddr, - jiffies); + if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) + delete_tempaddrs(ifp->idev, ifp); + else + manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft, + cfg->preferred_lft, !was_managetempaddr, + jiffies); } addrconf_verify_rtnl(net); @@ -4968,13 +4954,16 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; struct inet6_dev *idev; struct ifa6_config cfg; + struct ifaddrmsg *ifm; + unsigned long timeout; + clock_t expires; + u32 flags; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, @@ -4997,8 +4986,18 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFA_PROTO]) cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]); + cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); + + /* We ignore other flags so far. */ + cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | + IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | + IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + + cfg.ifa_flags |= IFA_F_PERMANENT; cfg.valid_lft = INFINITY_LIFE_TIME; cfg.preferred_lft = INFINITY_LIFE_TIME; + expires = 0; + flags = 0; if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -5006,27 +5005,44 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ci = nla_data(tb[IFA_CACHEINFO]); cfg.valid_lft = ci->ifa_valid; cfg.preferred_lft = ci->ifa_prefered; + + if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) { + NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); + return -EINVAL; + } + + timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + cfg.ifa_flags &= ~IFA_F_PERMANENT; + cfg.valid_lft = timeout; + expires = jiffies_to_clock_t(timeout * HZ); + flags = RTF_EXPIRES; + } + + timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + cfg.ifa_flags |= IFA_F_DEPRECATED; + + cfg.preferred_lft = timeout; + } } + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ifm->ifa_index); if (!dev) { NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); - return -ENODEV; + err = -ENODEV; + goto unlock_rtnl; } - if (tb[IFA_FLAGS]) - cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]); - else - cfg.ifa_flags = ifm->ifa_flags; - - /* We ignore other flags so far. */ - cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | - IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | - IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; - + netdev_lock_ops(dev); idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) - return PTR_ERR(idev); + if (IS_ERR(idev)) { + err = PTR_ERR(idev); + goto unlock; + } if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; @@ -5034,7 +5050,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.ifa_flags & IFA_F_NODAD && cfg.ifa_flags & IFA_F_OPTIMISTIC) { NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto unlock; } ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1); @@ -5043,7 +5060,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, * It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ - return inet6_addr_add(net, ifm->ifa_index, &cfg, extack); + err = inet6_addr_add(net, dev, &cfg, expires, flags, extack); + goto unlock; } if (nlh->nlmsg_flags & NLM_F_EXCL || @@ -5051,10 +5069,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG_MOD(extack, "address already assigned"); err = -EEXIST; } else { - err = inet6_addr_modify(net, ifa, &cfg); + err = inet6_addr_modify(net, ifa, &cfg, expires, flags); } in6_ifa_put(ifa); +unlock: + netdev_unlock_ops(dev); +unlock_rtnl: + rtnl_net_unlock(net); return err; } @@ -5108,22 +5130,6 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } -enum addr_type_t { - UNICAST_ADDR, - MULTICAST_ADDR, - ANYCAST_ADDR, -}; - -struct inet6_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; - enum addr_type_t type; -}; - static int inet6_fill_ifaddr(struct sk_buff *skb, const struct inet6_ifaddr *ifa, struct inet6_fill_args *args) @@ -5202,15 +5208,16 @@ error: return -EMSGSIZE; } -static int inet6_fill_ifmcaddr(struct sk_buff *skb, - const struct ifmcaddr6 *ifmca, - struct inet6_fill_args *args) +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args) { int ifindex = ifmca->idev->dev->ifindex; u8 scope = RT_SCOPE_UNIVERSE; struct nlmsghdr *nlh; - if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) + if (!args->force_rt_scope_universe && + ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, @@ -5236,9 +5243,9 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, return 0; } -static int inet6_fill_ifacaddr(struct sk_buff *skb, - const struct ifacaddr6 *ifaca, - struct inet6_fill_args *args) +int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, + struct inet6_fill_args *args) { struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); int ifindex = dev ? dev->ifindex : 1; @@ -5342,12 +5349,12 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); return -EINVAL; @@ -5399,6 +5406,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .flags = NLM_F_MULTI, .netnsid = -1, .type = type, + .force_rt_scope_universe = false, }; struct { unsigned long ifindex; @@ -5479,7 +5487,8 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, struct ifaddrmsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request"); return -EINVAL; } @@ -5488,7 +5497,6 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); return -EINVAL; @@ -5527,6 +5535,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .event = RTM_NEWADDR, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; @@ -5598,6 +5607,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) .event = event, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; int err = -ENOBUFS; @@ -5615,8 +5625,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); } static void ipv6_store_devconf(const struct ipv6_devconf *cnf, @@ -5786,6 +5795,27 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, } } +static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb, + struct inet6_dev *idev) +{ + struct nlattr *nla; + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, u32 ext_filter_mask) { @@ -5808,18 +5838,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, /* XXX - MC not implemented */ - if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS) - return 0; - - nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); - - nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) { + if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0) + goto nla_put_failure; + } nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) @@ -6093,7 +6115,8 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); return -EINVAL; } @@ -6103,7 +6126,6 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); @@ -6171,8 +6193,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); } static inline size_t inet6_prefix_nlmsg_size(void) @@ -6239,8 +6260,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); } static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -6307,7 +6327,7 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, +static int addrconf_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6332,7 +6352,7 @@ static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, +static int addrconf_sysctl_mtu(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; @@ -6366,7 +6386,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.disable_ipv6) ^ (!newf); @@ -6377,7 +6397,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf) } } -static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) +static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf) { struct net *net = (struct net *)table->extra2; int old; @@ -6387,7 +6407,7 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); old = *p; @@ -6396,14 +6416,15 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) if (p == &net->ipv6.devconf_all->disable_ipv6) { WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf); addrconf_disable_change(net, newf); - } else if ((!newf) ^ (!old)) + } else if ((!newf) ^ (!old)) { dev_disable_change((struct inet6_dev *)table->extra1); + } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } -static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, +static int addrconf_sysctl_disable(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6428,7 +6449,7 @@ static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, +static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6442,20 +6463,20 @@ static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, if (write && old != new) { struct net *net = ctl->extra2; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - if (valp == &net->ipv6.devconf_dflt->proxy_ndp) + if (valp == &net->ipv6.devconf_dflt->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - else if (valp == &net->ipv6.devconf_all->proxy_ndp) + } else if (valp == &net->ipv6.devconf_all->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - else { + } else { struct inet6_dev *idev = ctl->extra1; inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -6463,13 +6484,13 @@ static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, idev->dev->ifindex, &idev->cnf); } - rtnl_unlock(); + rtnl_net_unlock(net); } return ret; } -static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, +static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6483,7 +6504,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, .mode = ctl->mode, }; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); new_val = *((u32 *)ctl->data); @@ -6506,19 +6527,23 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev->cnf.addr_gen_mode != new_val) { WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); + netdev_lock_ops(idev->dev); addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val); for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev && idev->cnf.addr_gen_mode != new_val) { WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); + netdev_lock_ops(idev->dev); addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } } @@ -6527,12 +6552,12 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, } out: - rtnl_unlock(); + rtnl_net_unlock(net); return ret; } -static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, +static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6549,7 +6574,7 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, lctl.maxlen = IPV6_MAX_STRLEN; lctl.data = str; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); if (!write && !secret->initialized) { @@ -6579,7 +6604,7 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, struct net_device *dev; for_each_netdev(net, dev) { - struct inet6_dev *idev = __in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev); if (idev) { WRITE_ONCE(idev->cnf.addr_gen_mode, @@ -6594,13 +6619,13 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, } out: - rtnl_unlock(); + rtnl_net_unlock(net); return err; } static -int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, +int addrconf_sysctl_ignore_routes_with_linkdown(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -6668,7 +6693,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) } static -int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) +int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) { struct net *net = (struct net *)ctl->extra2; struct inet6_dev *idev; @@ -6678,7 +6703,7 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); WRITE_ONCE(*valp, val); @@ -6687,7 +6712,7 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) struct net_device *dev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) addrconf_disable_policy_idev(idev, val); } @@ -6696,11 +6721,11 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) addrconf_disable_policy_idev(idev, val); } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } -static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, +static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; @@ -6924,6 +6949,15 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "ra_honor_pio_pflag", + .data = &ipv6_devconf.ra_honor_pio_pflag, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #ifdef CONFIG_IPV6_ROUTER_PREF { .procname = "accept_ra_rtr_pref", @@ -7183,14 +7217,12 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - /* sentinel */ - } }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { + size_t table_size = ARRAY_SIZE(addrconf_sysctl); int i, ifindex; struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; @@ -7199,7 +7231,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!table) goto out; - for (i = 0; table[i].data; i++) { + for (i = 0; i < table_size; i++) { table[i].data += (char *)p - (char *)&ipv6_devconf; /* If one of these is already set, then it is not safe to * overwrite either of them: this makes proc_dointvec_minmax @@ -7214,7 +7246,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); p->sysctl_header = register_net_sysctl_sz(net, path, table, - ARRAY_SIZE(addrconf_sysctl)); + table_size); if (!p->sysctl_header) goto free; @@ -7237,7 +7269,7 @@ out: static void __addrconf_sysctl_unregister(struct net *net, struct ipv6_devconf *p, int ifindex) { - struct ctl_table *table; + const struct ctl_table *table; if (!p->sysctl_header) return; @@ -7398,6 +7430,27 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { .set_link_af = inet6_set_link_af, }; +static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK, + .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR, + .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR, + .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR, + .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETMULTICAST, + .dumpit = inet6_dump_ifmcaddr, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETANYCAST, + .dumpit = inet6_dump_ifacaddr, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETNETCONF, + .doit = inet6_netconf_get_devconf, .dumpit = inet6_netconf_dump_devconf, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + /* * Init / cleanup code */ @@ -7425,9 +7478,9 @@ int __init addrconf_init(void) goto out_nowq; } - rtnl_lock(); + rtnl_net_lock(&init_net); idev = ipv6_add_dev(blackhole_netdev); - rtnl_unlock(); + rtnl_net_unlock(&init_net); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; @@ -7439,44 +7492,14 @@ int __init addrconf_init(void) addrconf_verify(&init_net); - rtnl_af_register(&inet6_ops); + err = rtnl_af_register(&inet6_ops); + if (err) + goto erraf; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK, - NULL, inet6_dump_ifinfo, RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) + err = rtnl_register_many(addrconf_rtnl_msg_handlers); + if (err) goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR, - inet6_rtm_newaddr, NULL, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR, - inet6_rtm_deladdr, NULL, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR, - inet6_rtm_getaddr, inet6_dump_ifaddr, - RTNL_FLAG_DOIT_UNLOCKED | - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST, - NULL, inet6_dump_ifmcaddr, - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST, - NULL, inet6_dump_ifacaddr, - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF, - inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, - RTNL_FLAG_DOIT_UNLOCKED | - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; err = ipv6_addr_label_rtnl_register(); if (err < 0) goto errout; @@ -7485,6 +7508,7 @@ int __init addrconf_init(void) errout: rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); +erraf: unregister_netdevice_notifier(&ipv6_dev_notf); errlo: destroy_workqueue(addrconf_wq); @@ -7506,17 +7530,17 @@ void addrconf_cleanup(void) rtnl_af_unregister(&inet6_ops); - rtnl_lock(); + rtnl_net_lock(&init_net); /* clean dev list */ for_each_netdev(&init_net, dev) { - if (__in6_dev_get(dev) == NULL) + if (!__in6_dev_get_rtnl_net(dev)) continue; addrconf_ifdown(dev, true); } addrconf_ifdown(init_net.loopback_dev, true); - rtnl_unlock(); + rtnl_net_unlock(&init_net); destroy_workqueue(addrconf_wq); } diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 17ac45aa7194..fb63ffbcfc64 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -234,7 +234,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - net->ipv6.ip6addrlbl_table.seq++; + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq + 1); return ret; } @@ -445,7 +446,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, }; static int ip6addrlbl_fill(struct sk_buff *skb, - struct ip6addrlbl_entry *p, + const struct ip6addrlbl_entry *p, u32 lseq, u32 portid, u32 seq, int event, unsigned int flags) @@ -472,12 +473,12 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); return -EINVAL; } - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_prefixlen || ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); @@ -498,7 +499,8 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; - int err; + int err = 0; + u32 lseq; if (cb->strict_check) { err = ip6addrlbl_valid_dump_req(nlh, cb->extack); @@ -507,10 +509,11 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_lock(); + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - net->ipv6.ip6addrlbl_table.seq, + lseq, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -522,7 +525,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline int ip6addrlbl_msgsize(void) @@ -540,7 +543,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, struct ifaddrlblmsg *ifal; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); return -EINVAL; } @@ -549,7 +553,6 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); return -EINVAL; @@ -614,7 +617,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - lseq = net->ipv6.ip6addrlbl_table.seq; + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); if (p) err = ip6addrlbl_fill(skb, p, lseq, NETLINK_CB(in_skb).portid, @@ -631,22 +634,17 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; } +static const struct rtnl_msg_handler ipv6_adddr_label_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDRLABEL, + .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDRLABEL, + .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDRLABEL, + .doit = ip6addrlbl_get, .dumpit = ip6addrlbl_dump, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + int __init ipv6_addr_label_rtnl_register(void) { - int ret; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL, - ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - if (ret < 0) - return ret; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL, - ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - if (ret < 0) - return ret; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, - ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); - return ret; + return rtnl_register_many(ipv6_adddr_label_rtnl_msg_handlers); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8041dc181bd4..acaff1296783 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -252,31 +252,29 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); err = sk->sk_prot->hash(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, @@ -509,7 +507,7 @@ void inet6_cleanup_sock(struct sock *sk) /* Free tx options */ - opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + opt = unrcu_pointer(xchg(&np->opt, NULL)); if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); @@ -708,6 +706,7 @@ const struct proto_ops inet6_stream_ops = { .splice_eof = inet_splice_eof, .sendmsg_locked = tcp_sendmsg_locked, .splice_read = tcp_splice_read, + .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .peek_len = tcp_peek_len, @@ -716,6 +715,7 @@ const struct proto_ops inet6_stream_ops = { #endif .set_rcvlowat = tcp_set_rcvlowat, }; +EXPORT_SYMBOL_GPL(inet6_stream_ops); const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, @@ -882,7 +882,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), @@ -1060,6 +1059,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, .ipv6_fragment = ip6_fragment, .ipv6_dev_find = ipv6_dev_find, + .ip6_xmit = ip6_xmit, }; static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0f2506e35359..21e01695b48c 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -49,9 +49,10 @@ static DEFINE_SPINLOCK(acaddr_hash_lock); static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); -static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) +static u32 inet6_acaddr_hash(const struct net *net, + const struct in6_addr *addr) { - u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } @@ -252,9 +253,8 @@ static void aca_free_rcu(struct rcu_head *h) static void aca_put(struct ifacaddr6 *ac) { - if (refcount_dec_and_test(&ac->aca_refcnt)) { - call_rcu(&ac->rcu, aca_free_rcu); - } + if (refcount_dec_and_test(&ac->aca_refcnt)) + call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, @@ -278,6 +278,37 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, return aca; } +static void inet6_ifacaddr_notify(struct net_device *dev, + const struct ifacaddr6 *ifaca, int event) +{ + struct inet6_fill_args fillargs = { + .event = event, + .netnsid = -1, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); + if (err < 0) { + pr_err("Failed to fill in anycast addresses (err %d)\n", err); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err); +} + /* * device anycast group inc (add if not found) */ @@ -333,6 +364,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) addrconf_join_solict(idev->dev, &aca->aca_addr); + inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST); + aca_put(aca); return 0; out: @@ -375,6 +408,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); + inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST); + aca_put(aca); return 0; } diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index eb8ee1e9373a..62618a058b8f 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -29,7 +29,7 @@ #include <net/calipso.h> #include <linux/atomic.h> #include <linux/bug.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/crc-ccitt.h> /* Maximium size of the calipso option including @@ -1072,8 +1072,13 @@ static int calipso_sock_getattr(struct sock *sk, struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; @@ -1125,8 +1130,13 @@ static int calipso_sock_setattr(struct sock *sk, { int ret_val; struct ipv6_opt_hdr *old, *new; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); old = NULL; if (txopts) old = txopts->hopopt; @@ -1153,8 +1163,13 @@ static int calipso_sock_setattr(struct sock *sk, static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7371886d4f9f..72adfc107b55 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include <net/tcp.h> #include <net/espintcp.h> #include <net/inet6_hashtables.h> +#include <linux/skbuff_ref.h> #include <linux/highmem.h> @@ -131,51 +132,21 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(page_to_netmem(sg_page(sg)), + skb->pp_recycle); } #ifdef CONFIG_INET6_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, @@ -188,20 +159,6 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -214,8 +171,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) sk = esp6_find_tcp_sk(x); err = PTR_ERR_OR_ZERO(sk); - if (err) + if (err) { + kfree_skb(skb); goto out; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -224,6 +183,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -255,8 +216,7 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) #else static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) { - kfree_skb(skb); - + WARN_ON(1); return -EOPNOTSUPP; } #endif @@ -314,7 +274,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } @@ -383,7 +343,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, __be16 dport) { struct udphdr *uh; - __be32 *udpdata32; unsigned int len; len = skb->len + esp->tailen - skb_transport_offset(skb); @@ -398,12 +357,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, *skb_mac_header(skb) = IPPROTO_UDP; - if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - return (struct ip_esp_hdr *)(udpdata32 + 2); - } - return (struct ip_esp_hdr *)(uh + 1); } @@ -428,6 +381,8 @@ static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); @@ -459,7 +414,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); break; case TCP_ENCAP_ESPINTCP: @@ -822,7 +776,6 @@ int esp6_input_done2(struct sk_buff *skb, int err) source = th->source; break; case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; break; default: @@ -867,7 +820,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == XFRM_MODE_IPTFS) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); @@ -1232,9 +1186,6 @@ static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); - break; #ifdef CONFIG_INET6_ESPINTCP case TCP_ENCAP_ESPINTCP: /* only the length field, TCP encap is done by diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 527b7caddbc6..7b41fb4f00b5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -80,9 +80,16 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ipv6_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET6); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET6); + + if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { + /* non-offload path will record the error and audit log */ + xfrm_state_put(x); + x = NULL; + } + if (!x) goto out_reset; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6789623b2b0d..457de0745a33 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -1204,10 +1204,9 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; - opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; - memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index f87ae33e1d01..949b72610df7 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -22,7 +22,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, return call_fib_notifiers(net, event_type, info); } -static unsigned int fib6_seq_read(struct net *net) +static unsigned int fib6_seq_read(const struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 52c04f0ac498..fd5f7112a51f 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -26,14 +26,18 @@ struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; + __be32 flowlabel; + __be32 flowlabel_mask; dscp_t dscp; + dscp_t dscp_mask; + u8 dscp_full:1; /* DSCP or TOS selector */ }; static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); - if (r->dst.plen || r->src.plen || r->dscp) + if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask) return false; return fib_rule_matchall(rule); } @@ -55,7 +59,7 @@ int fib6_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, AF_INET6, extack); } -unsigned int fib6_rules_seq_read(struct net *net) +unsigned int fib6_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET6); } @@ -233,8 +237,12 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, rt = pol_lookup_func(lookup, net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { + struct inet6_dev *idev = ip6_dst_idev(&rt->dst); + + if (!idev) + goto again; err = fib6_rule_saddr(net, rule, flags, flp6, - ip6_dst_idev(&rt->dst)->dev); + idev->dev); if (err == -EAGAIN) goto again; @@ -324,31 +332,101 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, return 0; } - if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel)) + if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask) + return 0; + + if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) return 0; if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl6->fl6_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl6->fl6_dport)) return 0; return 1; } +static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + if (rule6->dscp) { + NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); + return -EINVAL; + } + + rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); + rule6->dscp_full = true; + + return 0; +} + +static int fib6_nl2rule_dscp_mask(const struct nlattr *nla, + struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + dscp_t dscp_mask; + + if (!rule6->dscp_full) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Cannot specify DSCP mask without DSCP value"); + return -EINVAL; + } + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + if (rule6->dscp & ~dscp_mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); + return -EINVAL; + } + + rule6->dscp_mask = dscp_mask; + + return 0; +} + +static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + __be32 flowlabel, flowlabel_mask; + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) || + NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK)) + return -EINVAL; + + flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]); + flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]); + + if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK], + "Invalid flow label mask"); + return -EINVAL; + } + + if (flowlabel & ~flowlabel_mask) { + NL_SET_ERR_MSG(extack, "Flow label and mask do not match"); + return -EINVAL; + } + + rule6->flowlabel = flowlabel; + rule6->flowlabel_mask = flowlabel_mask; + + return 0; +} + static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { + struct fib6_rule *rule6 = (struct fib6_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct net *net = sock_net(skb->sk); - struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, @@ -356,6 +434,18 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, goto errout; } rule6->dscp = inet_dsfield_to_dscp(frh->tos); + rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0; + + if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) + goto errout; + + if (tb[FRA_DSCP_MASK] && + fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0) + goto errout; + + if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && + fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) + goto errout; if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { @@ -409,7 +499,33 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) return 0; - if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos) + if (frh->tos && + (rule6->dscp_full || + inet_dscp_to_dsfield(rule6->dscp) != frh->tos)) + return 0; + + if (tb[FRA_DSCP]) { + dscp_t dscp; + + dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); + if (!rule6->dscp_full || rule6->dscp != dscp) + return 0; + } + + if (tb[FRA_DSCP_MASK]) { + dscp_t dscp_mask; + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); + if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask) + return 0; + } + + if (tb[FRA_FLOWLABEL] && + nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) + return 0; + + if (tb[FRA_FLOWLABEL_MASK] && + nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask) return 0; if (frh->src_len && @@ -430,7 +546,22 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; - frh->tos = inet_dscp_to_dsfield(rule6->dscp); + + if (rule6->dscp_full) { + frh->tos = 0; + if (nla_put_u8(skb, FRA_DSCP, + inet_dscp_to_dsfield(rule6->dscp) >> 2) || + nla_put_u8(skb, FRA_DSCP_MASK, + inet_dscp_to_dsfield(rule6->dscp_mask) >> 2)) + goto nla_put_failure; + } else { + frh->tos = inet_dscp_to_dsfield(rule6->dscp); + } + + if (rule6->flowlabel_mask && + (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) || + nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask))) + goto nla_put_failure; if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || @@ -446,7 +577,11 @@ nla_put_failure: static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ - + nla_total_size(16); /* src */ + + nla_total_size(16) /* src */ + + nla_total_size(1) /* dscp */ + + nla_total_size(1) /* dscp mask */ + + nla_total_size(4) /* flowlabel */ + + nla_total_size(4); /* flowlabel mask */ } static void fib6_rule_flush_cache(struct fib_rules_ops *ops) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 1635da07285f..3fd19a84b358 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); @@ -175,14 +175,16 @@ static bool icmpv6_mask_allow(struct net *net, int type) return false; } -static bool icmpv6_global_allow(struct net *net, int type) +static bool icmpv6_global_allow(struct net *net, int type, + bool *apply_ratelimit) { if (icmpv6_mask_allow(net, type)) return true; - if (icmp_global_allow()) + if (icmp_global_allow(net)) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -191,13 +193,13 @@ static bool icmpv6_global_allow(struct net *net, int type) * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) + struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(net, type)) + if (!apply_ratelimit) return true; /* @@ -212,7 +214,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { res = true; } else { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); int tmo = net->ipv6.sysctl.icmpv6_time; struct inet_peer *peer; @@ -220,14 +222,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), ICMP6_MIB_RATELIMITHOST); + else + icmp_global_consume(net); dst_release(dst); return res; } @@ -241,7 +245,7 @@ static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, dst = ip6_route_output(net, sk, fl6); if (!dst->error) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct in6_addr prefsrc; rt6_get_prefsrc(rt, &prefsrc); @@ -452,6 +456,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; + bool apply_ratelimit = false; struct dst_entry *dst; struct icmp6hdr tmp_hdr; struct flowi6 fl6; @@ -468,7 +473,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!skb->dev) return; - net = dev_net(skb->dev); + + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules @@ -491,7 +499,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) - return; + goto out; saddr = NULL; } @@ -521,7 +529,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* @@ -530,14 +538,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } - /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, type, &apply_ratelimit)) goto out_bh_enable; mip6_addr_swap(skb, parm); @@ -575,8 +584,8 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); - if (!icmpv6_xrlim_allow(sk, type, &fl6)) - goto out; + if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) + goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; @@ -594,7 +603,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) - goto out; + goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); @@ -610,13 +619,12 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out_dst_release; } - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, + &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); @@ -624,13 +632,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } - rcu_read_unlock(); + out_dst_release: dst_release(dst); -out: +out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); +out: + rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); @@ -673,8 +683,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, - skb, 0); + rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -711,12 +721,13 @@ EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); + bool apply_ratelimit = false; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; @@ -781,8 +792,9 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) goto out; /* Check the ratelimit */ - if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || - !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + if ((!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit)) goto out_dst_release; idev = __in6_dev_get(skb->dev); @@ -803,7 +815,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT)) { + dst_rt6_info(dst), MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { @@ -824,7 +836,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; @@ -881,7 +893,7 @@ out: static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -913,7 +925,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -931,7 +943,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -945,12 +957,9 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - reason = ping_rcv(skb); - break; - case ICMPV6_EXT_ECHO_REPLY: - reason = ping_rcv(skb); - break; + ping_rcv(skb); + return 0; case ICMPV6_PKT_TOOBIG: /* BUGGG_FUTURE: if packet contains rthdr, we cannot update @@ -1026,9 +1035,9 @@ static int icmpv6_rcv(struct sk_buff *skb) csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; @@ -1206,7 +1215,6 @@ static struct ctl_table ipv6_icmp_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index ad5f6f6ba333..85b92917849b 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -108,6 +108,7 @@ int ila_lwt_init(void); void ila_lwt_fini(void); int ila_xlat_init_net(struct net *net); +void ila_xlat_pre_exit_net(struct net *net); void ila_xlat_exit_net(struct net *net); int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 8c1ce78956ba..7d574f5132e2 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -38,7 +38,7 @@ static inline struct ila_params *ila_params_lwtunnel( static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); - struct rt6_info *rt = (struct rt6_info *)orig_dst; + struct rt6_info *rt = dst_rt6_info(orig_dst); struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate); struct dst_entry *dst; int err = -EINVAL; @@ -58,7 +58,9 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) return orig_dst->lwtstate->orig_output(net, sk, skb); } + local_bh_disable(); dst = dst_cache_get(&ilwt->dst_cache); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6; @@ -70,7 +72,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = orig_dst->dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst, + fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); dst = ip6_route_output(net, NULL, &fl6); @@ -86,10 +88,15 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (ilwt->connected) + /* cache only if we don't create a dst reference loop */ + if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); + local_bh_enable(); + } } + skb_dst_drop(skb); skb_dst_set(skb, dst); return dst_output(net, sk, skb); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 69caed07315f..976c78efbae1 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -71,6 +71,11 @@ ila_xlat_init_fail: return err; } +static __net_exit void ila_pre_exit_net(struct net *net) +{ + ila_xlat_pre_exit_net(net); +} + static __net_exit void ila_exit_net(struct net *net) { ila_xlat_exit_net(net); @@ -78,6 +83,7 @@ static __net_exit void ila_exit_net(struct net *net) static struct pernet_operations ila_net_ops = { .init = ila_init_net, + .pre_exit = ila_pre_exit_net, .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 67e8c9440977..1d41b2ab4884 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -105,16 +105,11 @@ static int parse_nl_config(struct genl_info *info, xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); - if (info->attrs[ILA_ATTR_CSUM_MODE]) - xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); - else - xp->ip.csum_mode = ILA_CSUM_NO_ACTION; + xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE], + ILA_CSUM_NO_ACTION); - if (info->attrs[ILA_ATTR_IDENT_TYPE]) - xp->ip.ident_type = nla_get_u8( - info->attrs[ILA_ATTR_IDENT_TYPE]); - else - xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; + xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE], + ILA_ATYPE_USE_FORMAT); if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -200,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = { }, }; +static DEFINE_MUTEX(ila_mutex); + static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -207,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->xlat.hooks_registered) { + if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ - err = nf_register_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); + mutex_lock(&ila_mutex); + if (!ilan->xlat.hooks_registered) { + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (!err) + WRITE_ONCE(ilan->xlat.hooks_registered, true); + } + mutex_unlock(&ila_mutex); if (err) return err; - - ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); @@ -619,6 +620,15 @@ int ila_xlat_init_net(struct net *net) return 0; } +void ila_xlat_pre_exit_net(struct net *net) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + + if (ilan->xlat.hooks_registered) + nf_unregister_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); +} + void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -626,10 +636,6 @@ void ila_xlat_exit_net(struct net *net) rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); - - if (ilan->xlat.hooks_registered) - nf_unregister_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 80043e46117c..8f500eaf33cf 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -54,21 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); - -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = sk->sk_v6_daddr; - sin6->sin6_port = inet_sk(sk)->inet_dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - sk->sk_bound_dev_if); -} -EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -151,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 2e81383b663b..76ee521189eb 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -21,6 +21,7 @@ #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> +#include <net/tcp.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -34,8 +35,8 @@ u32 inet6_ehashfn(const struct net *net, lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); - return __inet6_ehashfn(lhash, lport, fhash, fport, - inet6_ehash_secret + net_hash_mix(net)); + return lport + __inet6_ehashfn(lhash, 0, fhash, fport, + inet6_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet6_ehashfn); @@ -45,7 +46,7 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn); * * The sockhash lock must be held as a reader here. */ -struct sock *__inet6_lookup_established(struct net *net, +struct sock *__inet6_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, @@ -88,7 +89,7 @@ found: } EXPORT_SYMBOL(__inet6_lookup_established); -static inline int compute_score(struct sock *sk, struct net *net, +static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) @@ -125,7 +126,7 @@ static inline int compute_score(struct sock *sk, struct net *net, * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ -struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, +struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, @@ -146,7 +147,7 @@ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ -static struct sock *inet6_lhash2_lookup(struct net *net, +static struct sock *inet6_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -173,7 +174,7 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } -struct sock *inet6_lookup_run_sk_lookup(struct net *net, +struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -198,7 +199,7 @@ struct sock *inet6_lookup_run_sk_lookup(struct net *net, } EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); -struct sock *inet6_lookup_listener(struct net *net, +struct sock *inet6_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -242,7 +243,8 @@ done: } EXPORT_SYMBOL_GPL(inet6_lookup_listener); -struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, +struct sock *inet6_lookup(const struct net *net, + struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, @@ -261,7 +263,9 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup, + u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -271,14 +275,26 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, - inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet6_match(net, sk2, saddr, daddr, + ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; + } + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { @@ -289,7 +305,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (twsk_unique(sk, sk2, twp)) + if (sk->sk_protocol == IPPROTO_TCP && + tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; @@ -336,11 +353,19 @@ static u64 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { + const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *saddr = &sk->sk_v6_daddr; + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); u64 port_offset = 0; + u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); - return __inet_hash_connect(death_row, sk, port_offset, + + hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); + + return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 08c929513065..a84d332f952f 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -135,15 +135,11 @@ static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) ns->id = id; - if (!info->attrs[IOAM6_ATTR_NS_DATA]) - data32 = IOAM6_U32_UNAVAILABLE; - else - data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); - - if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) - data64 = IOAM6_U64_UNAVAILABLE; - else - data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); + data32 = nla_get_u32_default(info->attrs[IOAM6_ATTR_NS_DATA], + IOAM6_U32_UNAVAILABLE); + + data64 = nla_get_u64_default(info->attrs[IOAM6_ATTR_NS_DATA_WIDE], + IOAM6_U64_UNAVAILABLE); ns->data = cpu_to_be32(data32); ns->data_wide = cpu_to_be64(data64); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 7563f8c6aa87..40df8bdfaacd 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -38,12 +38,15 @@ struct ioam6_lwt_freq { }; struct ioam6_lwt { + struct dst_entry null_dst; struct dst_cache cache; struct ioam6_lwt_freq freq; atomic_t pkt_cnt; u8 mode; + bool has_tunsrc; + struct in6_addr tunsrc; struct in6_addr tundst; - struct ioam6_lwt_encap tuninfo; + struct ioam6_lwt_encap tuninfo; }; static const struct netlink_range_validation freq_range = { @@ -72,8 +75,10 @@ static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8, IOAM6_IPTUNNEL_MODE_MIN, IOAM6_IPTUNNEL_MODE_MAX), + [IOAM6_IPTUNNEL_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), - [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), + [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN( + sizeof(struct ioam6_trace_hdr)), }; static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) @@ -85,7 +90,7 @@ static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | - trace->type.bit21) + trace->type.bit21 | trace->type.bit23) return false; trace->nodelen = 0; @@ -138,10 +143,13 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, } } - if (!tb[IOAM6_IPTUNNEL_MODE]) - mode = IOAM6_IPTUNNEL_MODE_INLINE; - else - mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]); + mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE], + IOAM6_IPTUNNEL_MODE_INLINE); + + if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) { + NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode"); + return -EINVAL; + } if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) { NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination"); @@ -167,19 +175,48 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, ilwt = ioam6_lwt_state(lwt); err = dst_cache_init(&ilwt->cache, GFP_ATOMIC); - if (err) { - kfree(lwt); - return err; - } + if (err) + goto free_lwt; + + /* This "fake" dst_entry will be stored in a dst_cache, which will call + * dst_hold() and dst_release() on it. We must ensure that dst_destroy() + * will never be called. For that, its initial refcount is 1 and +1 when + * it is stored in the cache. Then, +1/-1 each time we read the cache + * and release it. Long story short, we're fine. + */ + dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); atomic_set(&ilwt->pkt_cnt, 0); ilwt->freq.k = freq_k; ilwt->freq.n = freq_n; ilwt->mode = mode; - if (tb[IOAM6_IPTUNNEL_DST]) + + if (!tb[IOAM6_IPTUNNEL_SRC]) { + ilwt->has_tunsrc = false; + } else { + ilwt->has_tunsrc = true; + ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]); + + if (ipv6_addr_any(&ilwt->tunsrc)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC], + "invalid tunnel source address"); + err = -EINVAL; + goto free_cache; + } + } + + if (tb[IOAM6_IPTUNNEL_DST]) { ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]); + if (ipv6_addr_any(&ilwt->tundst)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST], + "invalid tunnel dest address"); + err = -EINVAL; + goto free_cache; + } + } + tuninfo = ioam6_lwt_info(lwt); tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1; tuninfo->pad[0] = IPV6_TLV_PADN; @@ -201,6 +238,11 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, *ts = lwt; return 0; +free_cache: + dst_cache_destroy(&ilwt->cache); +free_lwt: + kfree(lwt); + return err; } static int ioam6_do_fill(struct net *net, struct sk_buff *skb) @@ -220,14 +262,15 @@ static int ioam6_do_fill(struct net *net, struct sk_buff *skb) } static int ioam6_do_inline(struct net *net, struct sk_buff *skb, - struct ioam6_lwt_encap *tuninfo) + struct ioam6_lwt_encap *tuninfo, + struct dst_entry *cache_dst) { struct ipv6hdr *oldhdr, *hdr; int hdrlen, err; hdrlen = (tuninfo->eh.hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -256,7 +299,10 @@ static int ioam6_do_inline(struct net *net, struct sk_buff *skb, static int ioam6_do_encap(struct net *net, struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo, - struct in6_addr *tundst) + bool has_tunsrc, + struct in6_addr *tunsrc, + struct in6_addr *tundst, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr, *inner_hdr; @@ -265,7 +311,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdrlen = (tuninfo->eh.hdrlen + 1) << 3; len = sizeof(*hdr) + hdrlen; - err = skb_cow_head(skb, len + skb->mac_len); + err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -285,8 +331,12 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdr->nexthdr = NEXTHDR_HOP; hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); hdr->daddr = *tundst; - ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, - IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); + + if (has_tunsrc) + memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc)); + else + ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, + IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); skb_postpush_rcsum(skb, hdr, len); @@ -295,8 +345,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct in6_addr orig_daddr; + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -304,14 +354,27 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ilwt = ioam6_lwt_state(dst->lwtstate); + ilwt = ioam6_lwt_state(orig_dst->lwtstate); /* Check for insertion frequency (i.e., "k over n" insertions) */ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt); if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k) goto out; - orig_daddr = ipv6_hdr(skb)->daddr; + local_bh_disable(); + dst = dst_cache_get(&ilwt->cache); + local_bh_enable(); + + /* This is how we notify that the destination does not change after + * transformation and that we need to use orig_dst instead of the cache + */ + if (dst == &ilwt->null_dst) { + dst_release(dst); + + dst = orig_dst; + /* keep refcount balance: dst_release() is called at the end */ + dst_hold(dst); + } switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: @@ -320,7 +383,7 @@ do_inline: if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst); if (unlikely(err)) goto drop; @@ -328,7 +391,9 @@ do_inline: case IOAM6_IPTUNNEL_MODE_ENCAP: do_encap: /* Encapsulation (ip6ip6) */ - err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst); + err = ioam6_do_encap(net, skb, &ilwt->tuninfo, + ilwt->has_tunsrc, &ilwt->tunsrc, + &ilwt->tundst, dst); if (unlikely(err)) goto drop; @@ -346,52 +411,65 @@ do_encap: goto drop; } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; - if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { - preempt_disable(); - dst = dst_cache_get(&ilwt->cache); - preempt_enable(); - - if (unlikely(!dst)) { - struct ipv6hdr *hdr = ipv6_hdr(skb); - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = hdr->daddr; - fl6.saddr = hdr->saddr; - fl6.flowlabel = ip6_flowinfo(hdr); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = hdr->nexthdr; - - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - err = dst->error; - dst_release(dst); - goto drop; - } - - preempt_disable(); - dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - preempt_enable(); + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; + goto drop; } + /* If the destination is the same after transformation (which is + * a valid use case for IOAM), then we don't want to add it to + * the cache in order to avoid a reference loop. Instead, we add + * our fake dst_entry to the cache as a way to detect this case. + * Otherwise, we add the resolved destination to the cache. + */ + local_bh_disable(); + if (orig_dst->lwtstate == dst->lwtstate) + dst_cache_set_ip6(&ilwt->cache, + &ilwt->null_dst, &fl6.saddr); + else + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); + local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; + } + + /* avoid lwtunnel_output() reentry loop when destination is the same + * after transformation (e.g., with the inline mode) + */ + if (orig_dst->lwtstate != dst->lwtstate) { skb_dst_drop(skb); skb_dst_set(skb, dst); - return dst_output(net, sk, skb); } out: - return dst->lwtstate->orig_output(net, sk, skb); + dst_release(dst); + return orig_dst->lwtstate->orig_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } static void ioam6_destroy_state(struct lwtunnel_state *lwt) { + /* Since the refcount of per-cpu dst_entry caches will never be 0 (see + * why above) when our "fake" dst_entry is used, it is not necessary to + * remove them before calling dst_cache_destroy() + */ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); } @@ -414,6 +492,13 @@ static int ioam6_fill_encap_info(struct sk_buff *skb, goto ret; if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + if (ilwt->has_tunsrc) { + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC, + &ilwt->tunsrc); + if (err) + goto ret; + } + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst); if (err) goto ret; @@ -435,8 +520,12 @@ static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) nla_total_size(sizeof(ilwt->mode)) + nla_total_size(sizeof(ilwt->tuninfo.traceh)); - if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) + if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + if (ilwt->has_tunsrc) + nlsize += nla_total_size(sizeof(ilwt->tunsrc)); + nlsize += nla_total_size(sizeof(ilwt->tundst)); + } return nlsize; } @@ -451,17 +540,21 @@ static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) return (ilwt_a->freq.k != ilwt_b->freq.k || ilwt_a->freq.n != ilwt_b->freq.n || ilwt_a->mode != ilwt_b->mode || + ilwt_a->has_tunsrc != ilwt_b->has_tunsrc || (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) || + (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && + ilwt_a->has_tunsrc && + !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) || trace_a->namespace_id != trace_b->namespace_id); } static const struct lwtunnel_encap_ops ioam6_iptun_ops = { .build_state = ioam6_build_state, .destroy_state = ioam6_destroy_state, - .output = ioam6_output, + .output = ioam6_output, .fill_encap = ioam6_fill_encap_info, - .get_encap_size = ioam6_encap_nlsize, + .get_encap_size = ioam6_encap_nlsize, .cmp_encap = ioam6_encap_cmp, .owner = THIS_MODULE, }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c1f62352a481..7094d7708686 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -198,16 +198,9 @@ static void node_free_immediate(struct net *net, struct fib6_node *fn) net->ipv6.rt6_stats->fib_nodes--; } -static void node_free_rcu(struct rcu_head *head) -{ - struct fib6_node *fn = container_of(head, struct fib6_node, rcu); - - kmem_cache_free(fib6_node_kmem, fn); -} - static void node_free(struct net *net, struct fib6_node *fn) { - call_rcu(&fn->rcu, node_free_rcu); + kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } @@ -256,40 +249,52 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *fib6_new_table(struct net *net, u32 id) { - struct fib6_table *tb; + struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(net, id); - if (tb) - fib6_link_table(net, tb); + new_tb = fib6_alloc_table(net, id); + if (!new_tb) + return NULL; + + spin_lock_bh(&net->ipv6.fib_table_hash_lock); - return tb; + tb = fib6_get_table(net, id); + if (unlikely(tb)) { + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + kfree(new_tb); + return tb; + } + + fib6_link_table(net, new_tb); + + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + + return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { - struct fib6_table *tb; struct hlist_head *head; - unsigned int h; + struct fib6_table *tb; - if (id == 0) + if (!id) id = RT6_TABLE_MAIN; - h = id & (FIB6_TABLE_HASHSZ - 1); - rcu_read_lock(); - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - if (tb->tb6_id == id) { - rcu_read_unlock(); + + head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; + + /* See comment in fib6_link_table(). RCU is not required, + * but rcu_dereference_raw() is used to avoid data-race. + */ + hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) + if (tb->tb6_id == id) return tb; - } - } - rcu_read_unlock(); return NULL; } @@ -345,17 +350,17 @@ static void __net_init fib6_tables_init(struct net *net) #endif -unsigned int fib6_tables_seq_read(struct net *net) +unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { - struct hlist_head *head = &net->ipv6.fib_table_hash[h]; - struct fib6_table *tb; + const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib_seq += tb->fib_seq; + fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); @@ -400,7 +405,7 @@ int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -416,7 +421,7 @@ int call_fib6_multipath_entry_notifiers(struct net *net, .nsiblings = nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -427,7 +432,7 @@ int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) .nsiblings = rt->fib6_nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } @@ -623,23 +628,22 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, - .filter.rtnl_held = true, + .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - unsigned int h, s_h; unsigned int e = 0, s_e; + struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; - struct hlist_head *head; - int res = 0; + unsigned int h, s_h; + int err = 0; + rcu_read_lock(); if (cb->strict_check) { - int err; - err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) - return err; + goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); @@ -654,8 +658,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return -ENOMEM; + if (!w) { + err = -ENOMEM; + goto unlock; + } w->func = fib6_dump_node; cb->args[2] = (long)w; @@ -675,46 +681,46 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) - goto out; + goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); - return -ENOENT; + err = -ENOENT; + goto unlock; } if (!cb->args[0]) { - res = fib6_dump_table(tb, skb, cb); - if (!res) + err = fib6_dump_table(tb, skb, cb); + if (!err) cb->args[0] = 1; } - goto out; + goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; - rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; - res = fib6_dump_table(tb, skb, cb); - if (res != 0) - goto out_unlock; + err = fib6_dump_table(tb, skb, cb); + if (err != 0) + goto out; next: e++; } } -out_unlock: - rcu_read_unlock(); +out: cb->args[1] = e; cb->args[0] = h; -out: - res = res < 0 ? res : skb->len; - if (res <= 0) + +unlock: + rcu_read_unlock(); + if (err <= 0) fib6_dump_end(cb); - return res; + return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) @@ -965,6 +971,7 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, if (!fib6_nh->rt6i_pcpu) return; + rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ @@ -973,7 +980,9 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; + + /* Paired with xchg() in rt6_get_pcpu_route() */ + pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes @@ -983,10 +992,11 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; - from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } + rcu_read_unlock(); } struct fib6_nh_pcpu_arg { @@ -1017,8 +1027,9 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, .table = table }; - nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, - &arg); + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); + rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; @@ -1036,8 +1047,14 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); - if (rt->nh && !list_empty(&rt->nh_list)) - list_del_init(&rt->nh_list); + if (rt->nh) { + spin_lock(&rt->nh->lock); + + if (!list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); + + spin_unlock(&rt->nh->lock); + } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1071,8 +1088,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, - struct nl_info *info, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -1185,8 +1202,8 @@ next_iter: while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->fib6_siblings, - &sibling->fib6_siblings); + list_add_tail_rcu(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, @@ -1205,7 +1222,9 @@ next_iter: fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); + rcu_read_unlock(); } /* @@ -1247,8 +1266,10 @@ add: fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); + rcu_read_lock(); rt6_multipath_rebalance(next_sibling); + rcu_read_unlock(); return err; } } @@ -1296,10 +1317,9 @@ add: } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1312,10 +1332,9 @@ add: if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1331,6 +1350,28 @@ add: return 0; } +static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) +{ + int err; + + spin_lock(&rt->nh->lock); + + if (rt->nh->dead) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = fib6_add_rt2node(fn, rt, info, extack, purge_list); + if (!err) + list_add(&rt->nh_list, &rt->nh->f6i_list); + } + + spin_unlock(&rt->nh->lock); + + return err; +} + static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && @@ -1385,6 +1426,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; + LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; @@ -1487,10 +1529,19 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, extack); + if (rt->nh) + err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); + else + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { - if (rt->nh) - list_add(&rt->nh_list, &rt->nh->f6i_list); + struct fib6_info *iter, *next; + + list_for_each_entry_safe(iter, next, &purge_list, purge_link) { + list_del(&iter->purge_link); + fib6_purge_rt(iter, fn, info->nl_net); + fib6_info_release(iter); + } + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) @@ -1965,7 +2016,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } @@ -2385,7 +2436,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else - del_timer(&net->ipv6.ip6_fib_timer); + timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } @@ -2425,6 +2476,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; + spin_lock_init(&net->ipv6.fib_table_hash_lock); + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) @@ -2472,7 +2525,7 @@ static void fib6_net_exit(struct net *net) { unsigned int i; - del_timer_sync(&net->ipv6.ip6_fib_timer); + timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; @@ -2495,6 +2548,12 @@ static struct pernet_operations fib6_net_ops = { .exit = fib6_net_exit, }; +static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, + .dumpit = inet6_dump_fib, + .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, +}; + int __init fib6_init(void) { int ret = -ENOMEM; @@ -2508,8 +2567,7 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, 0); + ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index eca07e10e21f..a3ff575798dd 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -907,6 +907,6 @@ int ip6_flowlabel_init(void) void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); - del_timer(&ip6_fl_gc_timer); + timer_delete(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c89aef524df9..2dc9dcffe2ca 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -43,6 +43,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/ipv6.h> @@ -496,11 +497,11 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) tpi->proto); if (tunnel) { if (tunnel->parms.collect_md) { + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; __be64 tun_id; - __be16 flags; - flags = tpi->flags; + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); @@ -551,14 +552,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (tunnel->parms.collect_md) { struct erspan_metadata *pkt_md, *md; + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; - __be16 flags; - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, @@ -580,7 +581,8 @@ static int ip6erspan_rcv(struct sk_buff *skb, md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); info->options_len = sizeof(*md); ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); @@ -748,8 +750,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 protocol; - __be16 flags; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -781,8 +783,11 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - flags = key->tun_flags & - (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); + ip_tunnel_flags_zero(flags); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, flags, key->tun_flags); tun_hlen = gre_calc_hlen(flags); if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) @@ -791,19 +796,21 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } else { if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) return -ENOMEM; - flags = tunnel->parms.o_flags; + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, @@ -825,7 +832,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return -1; @@ -859,7 +867,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags))) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, @@ -886,7 +895,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return err; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); @@ -939,6 +949,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info = NULL; struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; bool truncate = false; int encap_limit = -1; __u8 dsfield = false; @@ -982,7 +993,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; - t->parms.o_flags &= ~TUNNEL_KEY; + __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); IPCB(skb)->flags = 0; /* For collect_md mode, derive fl6 from the tunnel key, @@ -1007,7 +1018,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_info->key.tun_flags)) goto tx_err; if (tun_info->options_len < sizeof(*md)) goto tx_err; @@ -1068,7 +1080,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, } /* Push GRE header. */ - gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) @@ -1211,8 +1225,8 @@ static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, t->parms.proto = p->proto; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); t->parms.fwmark = p->fwmark; t->parms.erspan_ver = p->erspan_ver; t->parms.index = p->index; @@ -1241,8 +1255,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; - p->i_flags = gre_flags_to_tnl_flags(u->i_flags); - p->o_flags = gre_flags_to_tnl_flags(u->o_flags); + gre_flags_to_tnl_flags(p->i_flags, u->i_flags); + gre_flags_to_tnl_flags(p->o_flags, u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } @@ -1394,7 +1408,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, ipv6h->daddr = t->parms.raddr; p = (__be16 *)(ipv6h + 1); - p[0] = t->parms.o_flags; + p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags); p[1] = htons(type); /* @@ -1421,7 +1435,6 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1431,7 +1444,6 @@ static void ip6gre_dev_free(struct net_device *dev) gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); } static void ip6gre_tunnel_setup(struct net_device *dev) @@ -1440,6 +1452,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_IP6GRE; dev->flags |= IFF_NOARP; @@ -1458,23 +1471,23 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); - __be16 flags; - dev->features |= GRE6_FEATURES | NETIF_F_LLTX; + dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; - flags = nt->parms.o_flags; - /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ - if (flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags)) return; - if (flags & TUNNEL_CSUM && nt->encap.type != TUNNEL_ENCAP_NONE) + if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) && + nt->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ip6gre_tunnel_init_common(struct net_device *dev) @@ -1486,16 +1499,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1519,9 +1527,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1565,7 +1570,7 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = { .flags = INET6_PROTO_FINAL, }; -static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; @@ -1582,16 +1587,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; - t = rtnl_dereference(ign->tunnels[prio][h]); + t = rtnl_net_dereference(net, ign->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1616,8 +1621,7 @@ static int __net_init ip6gre_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - + ign->fb_tunnel_dev->netns_immutable = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; @@ -1636,19 +1640,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch_rtnl = ip6gre_exit_batch_rtnl, + .exit_rtnl = ip6gre_exit_rtnl_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; @@ -1795,12 +1789,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1853,7 +1847,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1879,16 +1872,11 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1910,9 +1898,6 @@ static int ip6erspan_tap_init(struct net_device *dev) cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1923,7 +1908,6 @@ static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1937,6 +1921,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -1975,7 +1960,7 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[], return ret; } -static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, +static int ip6gre_newlink_common(struct net *link_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -1996,7 +1981,7 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); nt->dev = dev; - nt->net = dev_net(dev); + nt->net = link_net; err = register_netdevice(dev); if (err) @@ -2009,12 +1994,14 @@ out: return err; } -static int ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6gre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? : dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2029,7 +2016,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, nt); @@ -2147,11 +2134,13 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm *p = &t->parms; - __be16 o_flags = p->o_flags; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (p->erspan_ver == 1 || p->erspan_ver == 2) { if (!p->collect_md) - o_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) goto nla_put_failure; @@ -2237,17 +2226,20 @@ static void ip6erspan_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); } -static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? : dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2263,7 +2255,7 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, nt); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 133610a49da6..39da6a7ce5f1 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -111,9 +111,8 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; @@ -327,9 +326,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); @@ -479,9 +477,7 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); - rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); - rcu_read_unlock(); return 0; } @@ -489,9 +485,15 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_input_finish); + int res; + + rcu_read_lock(); + res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, + ip6_input_finish); + rcu_read_unlock(); + + return res; } EXPORT_SYMBOL_GPL(ip6_input); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c8b909a9904f..9822163428b0 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -67,7 +67,7 @@ static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) off += len; } - skb_gro_pull(skb, off - skb_network_offset(skb)); + skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb)); return proto; } @@ -236,8 +236,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, if (unlikely(!iph)) goto out; - skb_set_network_header(skb, off); - NAPI_GRO_CB(skb)->inner_network_offset = off; + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; flush += ntohs(iph->payload_len) != skb->len - hlen; @@ -260,7 +259,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, NAPI_GRO_CB(skb)->proto = proto; flush--; - nlen = skb_network_header_len(skb); + nlen = skb_gro_offset(skb) - off; list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; @@ -291,19 +290,8 @@ not_same_flow: nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } - /* flush if Traffic Class fields are different */ - NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) | - (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); - NAPI_GRO_CB(p)->flush |= flush; - - /* If the previous IP ID value was based on an atomic - * datagram we can overwrite the value and ignore it. - */ - if (NAPI_GRO_CB(skb)->is_atomic) - NAPI_GRO_CB(p)->flush_id = 0; } - NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b9dd3a66e423..7bd29a9ff0db 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -70,11 +70,15 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { + /* Make sure idev stays alive */ + rcu_read_lock(); skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); return -ENOMEM; } + rcu_read_unlock(); } hdr = ipv6_hdr(skb); @@ -120,10 +124,10 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); rcu_read_lock(); - nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); + nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); - if (unlikely(IS_ERR_OR_NULL(neigh))) { + if (IS_ERR_OR_NULL(neigh)) { if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { @@ -234,7 +238,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; - if (unlikely(READ_ONCE(idev->cnf.disable_ipv6))) { + if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; @@ -255,7 +259,7 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP and SCTP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. @@ -283,11 +287,15 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { + /* Make sure idev stays alive */ + rcu_read_lock(); skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); return -ENOBUFS; } + rcu_read_unlock(); } if (opt) { @@ -599,21 +607,21 @@ int ip6_forward(struct sk_buff *skb) * send a redirect. */ - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) target = &rt->rt6i_gateway; else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); + rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } else { int addrtype = ipv6_addr_type(&hdr->saddr); @@ -856,10 +864,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; @@ -955,7 +963,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -1016,7 +1024,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(frag, tstamp, mono_delivery_time); + skb_set_delivery_time(frag, tstamp, tstamp_type); err = output(net, sk, frag); if (err) goto fail; @@ -1063,7 +1071,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, return NULL; } - rt = (struct rt6_info *)dst; + rt = dst_rt6_info(dst); /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, @@ -1118,12 +1126,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct rt6_info *rt; *dst = ip6_route_output(net, sk, fl6); - rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; + rt = (*dst)->error ? NULL : dst_rt6_info(*dst); rcu_read_lock(); from = rt ? rcu_dereference(rt->from) : NULL; err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, + fl6->flowi6_l3mdev, &fl6->saddr); rcu_read_unlock(); @@ -1159,7 +1168,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * dst entry and replace it instead with the * dst entry of the nexthop router */ - rt = (struct rt6_info *) *dst; + rt = dst_rt6_info(*dst); rcu_read_lock(); n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); @@ -1377,6 +1386,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; + v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); @@ -1392,8 +1402,12 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; - sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); - + cork->base.priority = ipc6->sockc.priority; + sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); + if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { + cork->base.flags |= IPCORK_TS_OPT_ID; + cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; + } cork->base.length = 0; cork->base.transmit_time = ipc6->sockc.transmit_time; @@ -1408,7 +1422,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6) + unsigned int flags) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1423,8 +1437,8 @@ static int __ip6_append_data(struct sock *sk, int offset = 0; bool zc = false; u32 tskey = 0; - struct rt6_info *rt = (struct rt6_info *)cork->dst; - bool paged, hold_tskey, extra_uref = false; + struct rt6_info *rt = dst_rt6_info(cork->dst); + bool paged, hold_tskey = false, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; @@ -1462,7 +1476,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && ipc6->dontfrag && + if (cork->length + length > mtu - headersize && v6_cork->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1510,7 +1524,8 @@ emsgsize: uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1534,10 +1549,15 @@ emsgsize: flags &= ~MSG_SPLICE_PAGES; } - hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; - if (hold_tskey) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; + if (cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + if (cork->flags & IPCORK_TS_OPT_ID) { + tskey = cork->ts_opt_id; + } else { + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + hold_tskey = true; + } + } /* * Let's try using as much space as possible. @@ -1680,8 +1700,9 @@ alloc_new_skb: pskb_trim_unique(skb_prev, maxfraglen); } if (copy > 0 && - getfrag(from, data + transhdrlen, offset, - copy, fraggap, skb) < 0) { + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1725,8 +1746,9 @@ alloc_new_skb: unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; @@ -1764,7 +1786,8 @@ alloc_new_skb: get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; @@ -1834,7 +1857,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6); + from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1877,7 +1900,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = v6_cork->opt; - struct rt6_info *rt = (struct rt6_info *)cork->base.dst; + struct rt6_info *rt = dst_rt6_info(cork->base.dst); struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; @@ -1922,9 +1945,12 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = cork->base.priority; skb->mark = cork->base.mark; - skb->tstamp = cork->base.transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); ip6_cork_steal_dst(skb, cork); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); @@ -1933,7 +1959,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, u8 icmp6_type; if (sk->sk_socket->type == SOCK_RAW && - !inet_test_bit(HDRINCL, sk)) + !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) icmp6_type = fl6->fl6_icmp_type; else icmp6_type = icmp6_hdr(skb)->icmp6_type; @@ -1949,9 +1975,10 @@ out: int ip6_send_skb(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int err; + rcu_read_lock(); err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) @@ -1961,6 +1988,7 @@ int ip6_send_skb(struct sk_buff *skb) IPSTATS_MIB_OUTDISCARDS); } + rcu_read_unlock(); return err; } @@ -2028,13 +2056,11 @@ struct sk_buff *ip6_make_skb(struct sock *sk, ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } - if (ipc6->dontfrag < 0) - ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk); err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6); + flags); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e9cc315832cb..894d3158a6f0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -52,7 +52,9 @@ #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/dst_metadata.h> +#include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); @@ -252,8 +254,7 @@ static void ip6_dev_free(struct net_device *dev) static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; @@ -608,7 +609,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, - 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + 0, 0, 0, IPPROTO_IPIP, + eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; @@ -619,7 +621,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, - IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + IPPROTO_IPIP, + eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); @@ -627,8 +630,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } skb_dst_set(skb2, &rt->dst); } else { - if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, - skb2->dev) || + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, + ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } @@ -798,17 +801,15 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct ipv6hdr *ipv6h; int nh, err; - if ((!(tpi->flags & TUNNEL_CSUM) && - (tunnel->parms.i_flags & TUNNEL_CSUM)) || - ((tpi->flags & TUNNEL_CSUM) && - !(tunnel->parms.i_flags & TUNNEL_CSUM))) { + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != + test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } - if (tunnel->parms.i_flags & TUNNEL_SEQ) { - if (!(tpi->flags & TUNNEL_SEQ) || + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { + if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); @@ -946,7 +947,9 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { - tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + + tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } @@ -1507,7 +1510,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { - dev->hard_header_len = tdev->hard_header_len + t_hlen; + dev->needed_headroom = tdev->hard_header_len + + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; @@ -1731,7 +1735,9 @@ ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); + int t_hlen; + t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; @@ -1740,13 +1746,13 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { - if (new_mtu > IP6_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { - if (new_mtu > IP_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); @@ -1846,7 +1852,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); @@ -1872,7 +1878,6 @@ ip6_tnl_dev_init_gen(struct net_device *dev) int t_hlen; t->dev = dev; - t->net = dev_net(dev); ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) @@ -1887,12 +1892,11 @@ ip6_tnl_dev_init_gen(struct net_device *dev) t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; + dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); @@ -1935,6 +1939,7 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -1997,17 +2002,22 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6_tnl_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; + struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; + struct net *net; int err; + net = params->link_net ? : dev_net(dev); + ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); @@ -2146,7 +2156,7 @@ struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); - return tunnel->net; + return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); @@ -2200,7 +2210,7 @@ static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) +static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; @@ -2212,25 +2222,27 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } @@ -2256,7 +2268,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; + ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) @@ -2277,19 +2289,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, + .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 7f4f976aa24a..40464a88bca6 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -45,6 +45,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <linux/etherdevice.h> #define IP6_VTI_HASH_SIZE_SHIFT 5 @@ -174,16 +175,10 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) } } -static void vti6_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); -} - static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); int err; dev->rtnl_link_ops = &vti6_link_ops; @@ -671,7 +666,8 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) dev->flags &= ~IFF_POINTOPOINT; if (keep_mtu && dev->mtu) { - dev->mtu = clamp(dev->mtu, dev->min_mtu, dev->max_mtu); + WRITE_ONCE(dev->mtu, + clamp(dev->mtu, dev->min_mtu, dev->max_mtu)); return; } @@ -892,7 +888,6 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_siocdevprivate = vti6_siocdevprivate, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -908,8 +903,8 @@ static void vti6_dev_setup(struct net_device *dev) dev->netdev_ops = &vti6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; - dev->priv_destructor = vti6_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_TUNNEL6; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); @@ -930,10 +925,6 @@ static inline int vti6_dev_init_gen(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; - t->net = dev_net(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; @@ -966,6 +957,7 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -1005,17 +997,20 @@ static void vti6_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; struct ip6_tnl *nt; + struct net *net; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); vti6_netlink_parms(data, &nt->parms); nt->parms.proto = IPPROTO_IPV6; + nt->net = net; if (vti6_locate(net, &nt->parms, 0)) return -EEXIST; @@ -1117,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, - struct list_head *list) +static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list) { - int h; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t; + int h; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } @@ -1175,22 +1170,9 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct vti6_net *ip6n; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, dev_to_kill); - } -} - static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch_rtnl = vti6_exit_batch_rtnl, + .exit_rtnl = vti6_exit_rtnl_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index cb0ee81a068a..3276cde5ebd7 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -125,7 +125,7 @@ static struct mr_table *ip6mr_mr_table_iter(struct net *net, return ret; } -static struct mr_table *ip6mr_get_table(struct net *net, u32 id) +static struct mr_table *__ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -136,6 +136,16 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id) return NULL; } +static struct mr_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + rcu_read_lock(); + mrt = __ip6mr_get_table(net, id); + rcu_read_unlock(); + return mrt; +} + static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { @@ -177,7 +187,7 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, arg->table = fib_rule_get_table(rule, arg); - mrt = ip6mr_get_table(rule->fr_net, arg->table); + mrt = __ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -276,7 +286,7 @@ static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } -static unsigned int ip6mr_rules_seq_read(struct net *net) +static unsigned int ip6mr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } @@ -304,6 +314,8 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id) return net->ipv6.mrt6; } +#define __ip6mr_get_table ip6mr_get_table + static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { @@ -335,7 +347,7 @@ static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static unsigned int ip6mr_rules_seq_read(struct net *net) +static unsigned int ip6mr_rules_seq_read(const struct net *net) { return 0; } @@ -382,7 +394,7 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; - mrt = ip6mr_get_table(net, id); + mrt = __ip6mr_get_table(net, id); if (mrt) return mrt; @@ -392,6 +404,10 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr_table *mrt) { + struct net *net = read_pnet(&mrt->net); + + WARN_ON_ONCE(!mr_can_free_table(net)); + timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); @@ -411,13 +427,15 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct mr_table *mrt; - mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (!mrt) + rcu_read_lock(); + mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); + if (!mrt) { + rcu_read_unlock(); return ERR_PTR(-ENOENT); + } iter->mrt = mrt; - rcu_read_lock(); return mr_vif_seq_start(seq, pos); } @@ -492,9 +510,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->_c.mfc_un.res.pkt, - mfc->_c.mfc_un.res.bytes, - mfc->_c.mfc_un.res.wrong_if); + atomic_long_read(&mfc->_c.mfc_un.res.pkt), + atomic_long_read(&mfc->_c.mfc_un.res.bytes), + atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && @@ -640,7 +658,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_immutable = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) @@ -856,7 +874,7 @@ static void ip6mr_update_thresholds(struct mr_table *mrt, cache->mfc_un.res.maxvif = vifi + 1; } } - cache->mfc_un.res.lastuse = jiffies; + WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int mif6_add(struct net *net, struct mr_table *mrt, @@ -1260,11 +1278,9 @@ static int ip6mr_device_event(struct notifier_block *this, return NOTIFY_DONE; } -static unsigned int ip6mr_seq_read(struct net *net) +static unsigned int ip6mr_seq_read(const struct net *net) { - ASSERT_RTNL(); - - return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); + return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, @@ -1369,6 +1385,12 @@ static struct pernet_operations ip6mr_net_ops = { .exit_batch = ip6mr_net_exit_batch, }; +static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, + .msgtype = RTM_GETROUTE, + .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, +}; + int __init ip6_mr_init(void) { int err; @@ -1391,9 +1413,8 @@ int __init ip6_mr_init(void) goto add_proto_fail; } #endif - err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, - ip6mr_rtm_getroute, ip6mr_rtm_dumproute, 0); - if (err == 0) + err = rtnl_register_many(ip6mr_rtnl_msg_handlers); + if (!err) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 @@ -1408,9 +1429,9 @@ reg_pernet_fail: return err; } -void ip6_mr_cleanup(void) +void __init ip6_mr_cleanup(void) { - rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); + rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif @@ -1495,7 +1516,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, } } if (list_empty(&mrt->mfc_unres_queue)) - del_timer(&mrt->ipmr_expire_timer); + timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { @@ -1914,9 +1935,9 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { - sr->pktcnt = c->_c.mfc_un.res.pkt; - sr->bytecnt = c->_c.mfc_un.res.bytes; - sr->wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } @@ -1986,9 +2007,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -2111,9 +2132,9 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; - c->_c.mfc_un.res.pkt++; - c->_c.mfc_un.res.bytes += skb->len; - c->_c.mfc_un.res.lastuse = jiffies; + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; @@ -2131,7 +2152,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { - c->_c.mfc_un.res.wrong_if++; + atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2273,13 +2294,15 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, int err; struct mr_table *mrt; struct mfc6_cache *cache; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); - mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (!mrt) + rcu_read_lock(); + mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); + if (!mrt) { + rcu_read_unlock(); return -ENOENT; + } - rcu_read_lock(); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); @@ -2431,8 +2454,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, errout: kfree_skb(skb); - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) @@ -2558,9 +2580,9 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); - tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; + tableid = nla_get_u32_default(tb[RTA_TABLE], 0); - mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); + mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; @@ -2607,7 +2629,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (filter.table_id) { struct mr_table *mrt; - mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); + mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d4c28ec1bc51..1e225e6489ea 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -111,8 +111,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, - opt); + opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt))); sk_dst_reset(sk); return opt; @@ -986,7 +985,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) - return udp_prot.setsockopt(sk, level, optname, optval, optlen); + return ip_setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; @@ -1476,7 +1475,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) - return udp_prot.getsockopt(sk, level, optname, optval, optlen); + return ip_getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 7ba01d8cfbae..65831b4fee1f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -33,8 +33,10 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> +#include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> +#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -47,6 +49,7 @@ #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> @@ -586,7 +589,8 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; - int i, count, copycount; + unsigned int count; + int i, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; @@ -610,7 +614,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, psl = sock_dereference(pmc->sflist, sk); count = psl ? psl->sl_count : 0; - copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + copycount = min(count, gsf->gf_numsrc); gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; @@ -900,6 +904,41 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, return mc; } +static void inet6_ifmcaddr_notify(struct net_device *dev, + const struct ifmcaddr6 *ifmca, int event) +{ + struct inet6_fill_args fillargs = { + .portid = 0, + .seq = 0, + .event = event, + .flags = 0, + .netnsid = -1, + .force_rt_scope_universe = true, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); +} + /* * device multicast group inc (add if not found) */ @@ -947,6 +986,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mld_del_delrec(idev, mc); igmp6_group_added(mc); + inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; @@ -976,6 +1016,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) *map = ma->next; igmp6_group_dropped(ma); + inet6_ifmcaddr_notify(idev->dev, ma, + RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); @@ -1020,29 +1062,31 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rcu_read_lock(); idev = __in6_dev_get(dev); - if (idev) { - for_each_mc_rcu(idev, mc) { - if (ipv6_addr_equal(&mc->mca_addr, group)) - break; - } - if (mc) { - if (src_addr && !ipv6_addr_any(src_addr)) { - struct ip6_sf_list *psf; + if (!idev) + goto unlock; + for_each_mc_rcu(idev, mc) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (!mc) + goto unlock; + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; - for_each_psf_rcu(mc, psf) { - if (ipv6_addr_equal(&psf->sf_addr, src_addr)) - break; - } - if (psf) - rv = psf->sf_count[MCAST_INCLUDE] || - psf->sf_count[MCAST_EXCLUDE] != - mc->mca_sfcount[MCAST_EXCLUDE]; - else - rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; - } else - rv = true; /* don't filter unspecified source */ + for_each_psf_rcu(mc, psf) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; } + if (psf) + rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || + READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != + READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); + else + rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; + } else { + rv = true; /* don't filter unspecified source */ } +unlock: rcu_read_unlock(); return rv; } @@ -1729,21 +1773,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; - int err; + struct net *net; - sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; - skb = sock_alloc_send_skb(sk, size, 1, &err); + skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; @@ -1751,6 +1793,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); + rcu_read_lock(); + + net = dev_net_rcu(dev); + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address @@ -1762,6 +1810,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + rcu_read_unlock(); + skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); @@ -2121,21 +2171,21 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; + const struct in6_addr *snd_addr, *saddr; + int err, len, payload_len, full_len; + struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; - const struct in6_addr *snd_addr, *saddr; - struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; + struct net *net; + struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -2146,19 +2196,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); + skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); - skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + rcu_read_lock(); + net = dev_net_rcu(dev); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); @@ -2183,9 +2235,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); @@ -2284,7 +2333,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* source filter not found, or count wrong => bug */ return -ESRCH; } - psf->sf_count[sfmode]--; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; @@ -2390,7 +2439,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, rcu_assign_pointer(pmc->mca_sources, psf); } } - psf->sf_count[sfmode]++; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } @@ -2502,7 +2551,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) - pmc->mca_sfcount[sfmode]++; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); @@ -2513,7 +2563,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int j; if (!delta) - pmc->mca_sfcount[sfmode]--; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { @@ -2558,7 +2609,8 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; - pmc->mca_sfcount[MCAST_EXCLUDE] = 1; + /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ + WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } /* called with mc_lock */ @@ -3073,8 +3125,8 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, - psf->sf_count[MCAST_INCLUDE], - psf->sf_count[MCAST_EXCLUDE]); + READ_ONCE(psf->sf_count[MCAST_INCLUDE]), + READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ae134634c323..ecb5c4b8518f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -200,9 +200,9 @@ static inline int ndisc_is_useropt(const struct net_device *dev, return opt->nd_opt_type == ND_OPT_PREFIX_INFO || opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || + opt->nd_opt_type == ND_OPT_6CO || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || - opt->nd_opt_type == ND_OPT_PREF64 || - ndisc_ops_is_useropt(dev, opt->nd_opt_type); + opt->nd_opt_type == ND_OPT_PREF64; } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, @@ -227,6 +227,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { + bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; @@ -262,22 +263,23 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, break; #endif default: - if (ndisc_is_useropt(dev, nd_opt)) { - ndopts->nd_useropts_end = nd_opt; - if (!ndopts->nd_useropts) - ndopts->nd_useropts = nd_opt; - } else { - /* - * Unknown options must be silently ignored, - * to accommodate future extension to the - * protocol. - */ - ND_PRINTK(2, notice, - "%s: ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, - nd_opt->nd_opt_len); - } + unknown = true; + } + if (ndisc_is_useropt(dev, nd_opt)) { + ndopts->nd_useropts_end = nd_opt; + if (!ndopts->nd_useropts) + ndopts->nd_useropts = nd_opt; + } else if (unknown) { + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the + * protocol. + */ + ND_PRINTK(2, notice, + "%s: ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, + nd_opt->nd_opt_len); } next_opt: opt_len -= l; @@ -416,15 +418,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); - if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + if (!skb) return NULL; - } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -435,7 +433,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ - skb_set_owner_w(skb, sk); + rcu_read_lock(); + skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); + rcu_read_unlock(); return skb; } @@ -471,16 +471,20 @@ static void ip6_nd_hdr(struct sk_buff *skb, void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(skb->dev); - struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; + struct net *net; + struct sock *sk; int err; - struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); + sk = net->ipv6.ndisc_sk; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; @@ -488,6 +492,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { + rcu_read_unlock(); kfree_skb(skb); return; } @@ -502,7 +507,6 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); @@ -1676,7 +1680,7 @@ static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; - struct net *net = dev_net(dev); + struct net *net = dev_net_rcu(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; @@ -1691,8 +1695,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; - if (netif_is_l3_master(skb->dev)) { - dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) return; } @@ -1722,17 +1726,17 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) if (IS_ERR(dst)) return; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, "Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); + + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); - if (peer) - inet_putpeer(peer); + if (!ret) goto release; @@ -1936,13 +1940,13 @@ static struct notifier_block ndisc_netdev_notifier = { }; #ifdef CONFIG_SYSCTL -static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, +static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); + strscpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, @@ -1951,7 +1955,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, +int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 53d255838e6a..4541836ee3da 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -36,6 +36,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), }; int err; @@ -126,7 +127,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; @@ -163,20 +164,20 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct ip6_fraglist_iter iter; struct sk_buff *frag2; - if (first_len - hlen > mtu || - skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + if (first_len - hlen > mtu) goto blackhole; - if (skb_cloned(skb)) + if (skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag2) { - if (frag2->len > mtu || - skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + if (frag2->len > mtu) goto blackhole; /* Partially cloned skb? */ - if (skb_shared(frag2)) + if (skb_shared(frag2) || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path; } @@ -192,7 +193,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -225,7 +226,7 @@ slow_path: goto blackhole; } - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index f3c8e2d918e1..e087a8e97ba7 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -8,7 +8,14 @@ menu "IPv6: Netfilter Configuration" # old sockopt interface and eval loop config IP6_NF_IPTABLES_LEGACY - tristate + tristate "Legacy IP6 tables support" + depends on INET && IPV6 + select NETFILTER_XTABLES + default n + help + ip6tables is a legacy packet classifier. + This is not needed if you are using iptables over nftables + (iptables-nft). config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 131f7bb2110d..d585ac3c1113 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); @@ -1773,7 +1773,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, goto out_free; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index df785ebda0ca..e8992693e14a 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -43,7 +43,7 @@ static int ip6table_filter_table_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ip6t_standard *)repl->entries)[1].target.verdict = - forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 52cf104e3478..e119d4f090cc 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -147,23 +147,27 @@ static struct pernet_operations ip6table_nat_net_ops = { static int __init ip6table_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv6_table, - ip6table_nat_table_init); + int ret; + /* net->gen->ptr[ip6table_nat_net_id] must be allocated + * before calling ip6t_nat_register_lookups(). + */ + ret = register_pernet_subsys(&ip6table_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&ip6table_nat_net_ops); + ret = xt_register_template(&nf_nat_ipv6_table, + ip6table_nat_table_init); if (ret) - xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); return ret; } static void __exit ip6table_nat_exit(void) { - unregister_pernet_subsys(&ip6table_nat_net_ops); xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); } module_init(ip6table_nat_init); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d0dcbaca1994..d6bd8f7079bb 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -62,7 +62,6 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { } }; static int nf_ct_frag6_sysctl_register(struct net *net) @@ -105,7 +104,7 @@ err_alloc: static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); @@ -124,7 +123,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { @@ -155,6 +155,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; + q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; @@ -164,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - const struct frag_hdr *fhdr, int nhoff) + const struct frag_hdr *fhdr, int nhoff, + int *refs) { unsigned int payload_len; struct net_device *dev; @@ -218,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EPROTO; } if (end > fq->q.len) { @@ -264,7 +269,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) @@ -284,7 +289,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = nf_ct_frag6_reasm(fq, skb, prev, dev); + err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or @@ -298,7 +303,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, return -EINPROGRESS; insert_error: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); err: skb_dst_drop(skb); return -EINVAL; @@ -312,13 +317,14 @@ err: * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { void *reasm_data; int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -369,7 +375,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, return 0; err: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EINVAL; } @@ -444,6 +450,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; + int refs = 0; u8 prevhdr; /* Jumbo payload inhibits frag. header */ @@ -470,23 +477,26 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); + rcu_read_lock(); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { + rcu_read_unlock(); pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; } spin_lock_bh(&fq->q.lock); - ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; } spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q); + rcu_read_unlock(); + inet_frag_putn(&fq->q, refs); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index a0a2de30be3e..b903c62c00c9 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -47,11 +47,12 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { - if (this_cpu_read(nf_skb_duplicated)) - return; + local_bh_disable(); + if (current->in_nf_duplicate) + goto out; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) - return; + goto out; #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_reset_ct(skb); @@ -63,12 +64,14 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->hop_limit; } if (nf_dup_ipv6_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip6_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } +out: + local_bh_enable(); } EXPORT_SYMBOL_GPL(nf_dup_ipv6); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index dedee264b8f6..9ae2b2725bf9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -223,33 +223,23 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; - int needs_ack; skb_reset_transport_header(nskb); - tcph = skb_put(nskb, sizeof(struct tcphdr)); + tcph = skb_put_zero(nskb, sizeof(struct tcphdr)); /* Truncate to length (no data) */ tcph->doff = sizeof(struct tcphdr)/4; tcph->source = oth->dest; tcph->dest = oth->source; if (oth->ack) { - needs_ack = 0; tcph->seq = oth->ack_seq; - tcph->ack_seq = 0; } else { - needs_ack = 1; tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + otcplen - (oth->doff<<2)); - tcph->seq = 0; + tcph->ack = 1; } - /* Reset flags */ - ((u_int8_t *)tcph)[13] = 0; tcph->rst = 1; - tcph->ack = needs_ack; - tcph->window = 0; - tcph->urg_ptr = 0; - tcph->check = 0; /* Adjust TCP checksum */ tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, @@ -278,13 +268,12 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { - struct sk_buff *nskb; - struct tcphdr _otcph; - const struct tcphdr *otcph; - unsigned int otcplen, hh_len; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); - struct ipv6hdr *ip6h; struct dst_entry *dst = NULL; + const struct tcphdr *otcph; + struct sk_buff *nskb; + struct tcphdr _otcph; + unsigned int otcplen; struct flowi6 fl6; if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || @@ -323,9 +312,8 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, if (IS_ERR(dst)) return; - hh_len = (dst->dev->hard_header_len + 15)&~15; - nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) + dst->trailer_len, + nskb = alloc_skb(LL_MAX_HEADER + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, GFP_ATOMIC); if (!nskb) { @@ -338,9 +326,8 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, nskb->mark = fl6.flowi6_mark; - skb_reserve(nskb, hh_len + dst->header_len); - ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, - ip6_dst_hoplimit(dst)); + skb_reserve(nskb, LL_MAX_HEADER); + nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); @@ -355,6 +342,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, */ if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct ipv6hdr *ip6h = ipv6_hdr(nskb); struct net_device *br_indev; br_indev = nf_bridge_get_physindev(oldskb, net); diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index a7690ec62325..9ea5ef56cb27 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -103,6 +103,10 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn const *ct; +#endif tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) { @@ -136,6 +140,25 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, return NULL; } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Do the lookup with the original socket address in + * case this is a reply packet of an established + * SNAT-ted connection. + */ + ct = nf_ct_get(skb, &ctinfo); + if (ct && + ((tproto != IPPROTO_ICMPV6 && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (tproto == IPPROTO_ICMPV6 && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; + dport = (tproto == IPPROTO_TCP) ? + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr, sport, dport, indev); } diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index c82f3fdd4a65..492a811828a7 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -38,13 +38,13 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, sizeof(struct in6_addr)); if (err < 0) return err; if (tb[NFTA_DUP_SREG_DEV]) - err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); return err; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 36dc14b34388..421036a3605b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -41,8 +41,6 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); - } else if (priv->flags & NFTA_FIB_F_IIF) { - fl6->flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); } if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) @@ -52,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowi6_mark = pkt->skb->mark; fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; + fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); return lookup_flags; } @@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); + const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = ®s->data[priv->dreg]; struct ipv6hdr *iph, _iph; @@ -169,6 +169,11 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, struct rt6_info *rt; int lookup_flags; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + if (priv->flags & NFTA_FIB_F_IIF) oif = nft_in(pkt); else if (priv->flags & NFTA_FIB_F_OIF) @@ -180,17 +185,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); - - if (nft_hook(pkt) == NF_INET_PRE_ROUTING || - nft_hook(pkt) == NF_INET_INGRESS) { - if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) || - nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } + if (nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; } + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); + *dest = 0; rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, lookup_flags); @@ -201,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev && - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) - goto put_rt_err; + if (!oif) { + found = rt->rt6i_idev->dev; + } else { + if (oif == rt->rt6i_idev->dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + found = oif; + } - nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); + nft_fib_store_result(dest, priv, found); put_rt_err: ip6_rt_put(rt); } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ef2059c88955..84d90dd8b3f0 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,8 +119,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; @@ -154,7 +152,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); if (IS_ERR(dst)) return PTR_ERR(dst); - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 6d1d9221649d..752327b10dde 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -27,7 +27,7 @@ #include <net/ipv6.h> #define MAX4(a, b, c, d) \ - max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) + MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0d896ca7b589..fda640ebd53f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -362,14 +362,14 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } @@ -390,7 +390,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); @@ -415,7 +415,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } @@ -598,7 +598,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct ipv6hdr *iph; struct sk_buff *skb; int err; - struct rt6_info *rt = (struct rt6_info *)*dstp; + struct rt6_info *rt = dst_rt6_info(*dstp); int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; @@ -619,9 +619,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc->priority; skb->mark = sockc->mark; - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_put(skb, length); skb_reset_network_header(skb); @@ -629,7 +629,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); @@ -769,18 +769,16 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl = inet_test_bit(HDRINCL, sk); + ipcm6_init_sk(&ipc6, sk); + /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = READ_ONCE(sk->sk_mark); + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = fl6.flowi6_mark; - if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -890,9 +888,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -903,9 +898,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -917,7 +909,7 @@ back_from_confirm: ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, - len, 0, &ipc6, &fl6, (struct rt6_info *)dst, + len, 0, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index acb4f119e11f..49740898bc13 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -68,7 +68,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip6_frag_expire(struct timer_list *t) { @@ -105,7 +106,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, - u32 *prob_offset) + u32 *prob_offset, int *refs) { struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; @@ -198,7 +199,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); @@ -220,7 +221,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip6_frag_reasm(fq, skb, prev_tail, dev); + err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; return err; } @@ -238,7 +239,7 @@ insert_error: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: @@ -254,7 +255,8 @@ err: * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; @@ -262,7 +264,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -303,9 +305,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_postpush_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); - rcu_read_unlock(); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; @@ -317,10 +317,8 @@ out_oversize: out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); - rcu_read_unlock(); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -1; } @@ -369,7 +367,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; - if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); @@ -377,19 +375,21 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } iif = skb->dev ? skb->dev->ifindex : 0; + rcu_read_lock(); fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; - int ret; + int ret, refs = 0; spin_lock(&fq->q.lock); fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, - &prob_offset); + &prob_offset, &refs); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + rcu_read_unlock(); + inet_frag_putn(&fq->q, refs); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); @@ -398,6 +398,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } return ret; } + rcu_read_unlock(); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); @@ -436,7 +437,6 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -449,7 +449,6 @@ static struct ctl_table ip6_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) @@ -487,7 +486,7 @@ err_alloc: static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1f4b935a0e57..0143262094b0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -87,7 +87,8 @@ struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); -static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); @@ -130,7 +131,6 @@ static struct fib6_info *rt6_get_route_info(struct net *net, struct uncached_list { spinlock_t lock; struct list_head head; - struct list_head quarantine; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); @@ -174,7 +174,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; bool handled = false; - if (rt_idev->dev == dev) { + if (rt_idev && rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); handled = true; @@ -188,8 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) handled = true; } if (handled) - list_move(&rt->dst.rt_uncached, - &ul->quarantine); + list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } @@ -226,7 +225,7 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), dst->dev, skb, daddr); @@ -234,8 +233,8 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rt6_info *rt = dst_rt6_info(dst); struct net_device *dev = dst->dev; - struct rt6_info *rt = (struct rt6_info *)dst; daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) @@ -354,7 +353,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; @@ -367,14 +366,15 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - from = xchg((__force struct fib6_info **)&rt->from, NULL); + from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; + struct fib6_info *from; if (idev && idev->dev != blackhole_netdev) { struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); @@ -384,6 +384,8 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) in6_dev_put(idev); } } + from = unrcu_pointer(xchg(&rt->from, NULL)); + fib6_info_release(from); } static bool __rt6_check_expired(const struct rt6_info *rt) @@ -410,12 +412,37 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } +static struct fib6_info * +rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) +{ + struct fib6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + iter = rcu_dereference(fn->leaf); + if (!iter) + goto out; + + while (iter) { + if (iter->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference(iter->fib6_next); + } + +out: + return NULL; +} + void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *sibling, *next_sibling; - struct fib6_info *match = res->f6i; + struct fib6_info *first, *match = res->f6i; + struct fib6_info *sibling; + int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -438,16 +465,25 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, return; } - if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) + first = rt6_multipath_first_sibling_rcu(match); + if (!first) + goto out; + + hash = fl6->mp_hash; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { + if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) + match = first; goto out; + } - list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, - fib6_siblings) { + list_for_each_entry_rcu(sibling, &first->fib6_siblings, + fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); - if (fl6->mp_hash > nh_upper_bound) + if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; @@ -637,6 +673,8 @@ static void rt6_probe(struct fib6_nh *fib6_nh) rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); + if (!idev) + goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) @@ -1288,7 +1326,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) - return (struct rt6_info *) dst; + return dst_rt6_info(dst); dst_release(dst); @@ -1408,6 +1446,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); + /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); @@ -1436,7 +1475,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, if (res->f6i->fib6_destroying) { struct fib6_info *from; - from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } @@ -1453,7 +1492,6 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { - struct fib6_info *from; struct net *net; if (!bucket || !rt6_ex) @@ -1465,8 +1503,6 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ - from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); - fib6_info_release(from); dst_dev_put(&rt6_ex->rt6i->dst); hlist_del_rcu(&rt6_ex->hlist); @@ -1735,6 +1771,7 @@ out: if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); + fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1783,11 +1820,13 @@ static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) void rt6_flush_exceptions(struct fib6_info *f6i) { - if (f6i->nh) - nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, - f6i); - else + if (f6i->nh) { + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); + rcu_read_unlock(); + } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); + } } /* Find cached rt in the hash table inside passed in rt @@ -2372,7 +2411,7 @@ static u32 rt6_multipath_custom_hash_outer(const struct net *net, hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_inner(const struct net *net, @@ -2421,7 +2460,7 @@ static u32 rt6_multipath_custom_hash_inner(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_skb(const struct net *net, @@ -2455,12 +2494,16 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl6 can be NULL */ @@ -2482,7 +2525,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: if (skb) { @@ -2510,11 +2553,14 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2551,7 +2597,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) @@ -2647,7 +2693,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); - rt6 = (struct rt6_info *)dst; + rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; @@ -2661,7 +2707,7 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; + struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; @@ -2744,7 +2790,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, struct fib6_info *from; struct rt6_info *rt; - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; @@ -2770,24 +2816,24 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); -static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *) dst; + struct rt6_info *rt = dst_rt6_info(dst); - if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - rcu_read_lock(); - if (rt6_check_expired(rt)) { - rt6_remove_exception_rt(rt); - dst = NULL; - } - rcu_read_unlock(); - } else { - dst_release(dst); - dst = NULL; + if (rt->rt6i_flags & RTF_CACHE) { + rcu_read_lock(); + if (rt6_check_expired(rt)) { + /* rt/dst can not be destroyed yet, + * because of rcu_read_lock() + */ + sk_dst_reset(sk); + rt6_remove_exception_rt(rt); } + rcu_read_unlock(); + return; } - return dst; + sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) @@ -2796,7 +2842,7 @@ static void ip6_link_failure(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { @@ -2852,7 +2898,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. @@ -3194,13 +3240,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) { struct net_device *dev = dst->dev; unsigned int mtu = dst_mtu(dst); - struct net *net = dev_net(dev); + struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + rcu_read_lock(); + + net = dev_net_rcu(dev); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + rcu_read_unlock(); + /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. @@ -3601,7 +3652,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, if (!dev) goto out; - if (idev->cnf.disable_ipv6) { + if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; @@ -3637,7 +3688,8 @@ out: in6_dev_put(idev); if (err) { - lwtstate_put(fib6_nh->fib_nh_lws); + fib_nh_common_release(&fib6_nh->nh_common); + fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } @@ -3686,61 +3738,14 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, - gfp_t gfp_flags, - struct netlink_ext_ack *extack) + gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; - struct fib6_info *rt = NULL; - struct nexthop *nh = NULL; struct fib6_table *table; - struct fib6_nh *fib6_nh; - int err = -EINVAL; - int addr_type; - - /* RTF_PCPU is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_PCPU) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); - goto out; - } - - /* RTF_CACHE is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_CACHE) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); - goto out; - } - - if (cfg->fc_type > RTN_MAX) { - NL_SET_ERR_MSG(extack, "Invalid route type"); - goto out; - } - - if (cfg->fc_dst_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - goto out; - } - if (cfg->fc_src_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid source address length"); - goto out; - } -#ifndef CONFIG_IPV6_SUBTREES - if (cfg->fc_src_len) { - NL_SET_ERR_MSG(extack, - "Specifying source address requires IPV6_SUBTREES to be enabled"); - goto out; - } -#endif - if (cfg->fc_nh_id) { - nh = nexthop_find_by_id(net, cfg->fc_nh_id); - if (!nh) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - goto out; - } - err = fib6_check_nexthop(nh, cfg, extack); - if (err) - goto out; - } + struct fib6_info *rt; + int err; - err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); @@ -3751,22 +3756,22 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, } else { table = fib6_new_table(net, cfg->fc_table); } + if (!table) { + err = -ENOBUFS; + goto err; + } - if (!table) - goto out; - - err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags, !nh); - if (!rt) - goto out; + rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); + if (!rt) { + err = -ENOMEM; + goto err; + } - rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, + rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); - /* Do not leave garbage there. */ - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - goto out_free; + goto free; } if (cfg->fc_flags & RTF_ADDRCONF) @@ -3774,12 +3779,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + - clock_t_to_jiffies(cfg->fc_expires)); + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->fib6_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; @@ -3792,21 +3797,54 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - if (nh) { - if (rt->fib6_src.plen) { - NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + return rt; +free: + kfree(rt); +err: + return ERR_PTR(err); +} + +static int ip6_route_info_create_nh(struct fib6_info *rt, + struct fib6_config *cfg, + gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct fib6_nh *fib6_nh; + int err; + + if (cfg->fc_nh_id) { + struct nexthop *nh; + + rcu_read_lock(); + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out_free; } + + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out_free; + if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -ENOENT; goto out_free; } + rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); + + rcu_read_unlock(); } else { + int addr_type; + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) - goto out; + goto out_release; fib6_nh = rt->fib6_nh; @@ -3825,21 +3863,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; - goto out; + goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; - } else - rt->fib6_prefsrc.plen = 0; + } - return rt; -out: + return 0; +out_release: fib6_info_release(rt); - return ERR_PTR(err); + return err; out_free: + rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); - return ERR_PTR(err); + return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, @@ -3852,6 +3890,10 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, if (IS_ERR(rt)) return PTR_ERR(rt); + err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); + if (err) + return err; + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); @@ -4080,9 +4122,9 @@ static int ip6_route_del(struct fib6_config *cfg, if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); - return __ip6_del_rt(rt, &cfg->fc_nlinfo); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + break; } if (cfg->fc_nh_id) continue; @@ -4097,13 +4139,13 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) - return __ip6_del_rt(rt, &cfg->fc_nlinfo); - - return __ip6_del_rt_siblings(rt, cfg); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + else + err = __ip6_del_rt_siblings(rt, cfg); + break; } } rcu_read_unlock(); @@ -4174,7 +4216,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu } } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; @@ -4437,6 +4479,53 @@ void rt6_purge_dflt_routers(struct net *net) rcu_read_unlock(); } +static int fib6_config_validate(struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + /* RTF_PCPU is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_PCPU) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); + goto errout; + } + + /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto errout; + } + + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto errout; + } + + if (cfg->fc_dst_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + goto errout; + } + +#ifdef CONFIG_IPV6_SUBTREES + if (cfg->fc_src_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid source address length"); + goto errout; + } + + if (cfg->fc_nh_id && cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto errout; + } +#else + if (cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, + "Specifying source address requires IPV6_SUBTREES to be enabled"); + goto errout; + } +#endif + return 0; +errout: + return -EINVAL; +} + static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) @@ -4445,7 +4534,7 @@ static void rtmsg_to_fib6_config(struct net *net, .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, - .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, + .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, @@ -4472,16 +4561,22 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) rtmsg_to_fib6_config(net, rtmsg, &cfg); - rtnl_lock(); switch (cmd) { case SIOCADDRT: + err = fib6_config_validate(&cfg, NULL); + if (err) + break; + + /* Only do the default setting of fc_metric in route adding */ + if (cfg.fc_metric == 0) + cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); break; } - rtnl_unlock(); + return err; } @@ -4571,6 +4666,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; + int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; @@ -4581,14 +4677,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); - if (!IS_ERR(f6i)) { - f6i->dst_nocount = true; + if (IS_ERR(f6i)) + return f6i; - if (!anycast && - (READ_ONCE(net->ipv6.devconf_all->disable_policy) || - READ_ONCE(idev->cnf.disable_policy))) - f6i->dst_nopolicy = true; - } + err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); + if (err) + return ERR_PTR(err); + + f6i->dst_nocount = true; + + if (!anycast && + (READ_ONCE(net->ipv6.devconf_all->disable_policy) || + READ_ONCE(idev->cnf.disable_policy))) + f6i->dst_nopolicy = true; return f6i; } @@ -5000,14 +5101,63 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, + [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; +static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, + struct netlink_ext_ack *extack, + bool newroute) +{ + struct rtnexthop *rtnh; + int remaining; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + + do { + bool has_gateway = cfg->fc_flags & RTF_GATEWAY; + int attrlen = rtnh_attrlen(rtnh); + + if (attrlen > 0) { + struct nlattr *nla, *attrs; + + attrs = rtnh_attrs(rtnh); + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + if (nla_len(nla) < sizeof(cfg->fc_gateway)) { + NL_SET_ERR_MSG(extack, + "Invalid IPv6 address in RTA_GATEWAY"); + return -EINVAL; + } + + has_gateway = true; + } + } + + if (newroute && (cfg->fc_nh_id || !has_gateway)) { + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + return -EINVAL; + } + + rtnh = rtnh_next(rtnh, &remaining); + } while (rtnh_ok(rtnh, remaining)); + + return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); +} + static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rtmsg *rtm; + bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; + struct rtmsg *rtm; unsigned int pref; int err; @@ -5025,6 +5175,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + if (tb[RTA_FLOWLABEL]) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Flow label cannot be specified for this operation"); + goto errout; + } + *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, @@ -5110,8 +5266,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); - err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, - cfg->fc_mp_len, extack); + err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } @@ -5152,29 +5307,28 @@ errout: struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; - struct list_head next; + struct list_head list; }; -static int ip6_route_info_append(struct net *net, - struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; - int err = -EEXIST; - list_for_each_entry(nh, rt6_nh_list, next) { + list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) - return err; + return -EEXIST; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; + nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); - list_add_tail(&nh->next, rt6_nh_list); + list_add_tail(&nh->list, rt6_nh_list); return 0; } @@ -5190,14 +5344,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, * nexthop. Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ + rcu_read_lock(); + if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { - rt = list_first_entry(&rt_last->fib6_siblings, - struct fib6_info, - fib6_siblings); + rt = list_first_or_null_rcu(&rt_last->fib6_siblings, + struct fib6_info, + fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); + + rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) @@ -5226,37 +5384,26 @@ out: return should_notify; } -static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, - struct netlink_ext_ack *extack) -{ - if (nla_len(nla) < sizeof(*gw)) { - NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); - return -EINVAL; - } - - *gw = nla_get_in6_addr(nla); - - return 0; -} - static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + struct rt6_nh *nh, *nh_safe; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct fib6_info *rt; + LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; - struct rt6_nh *nh, *nh_safe; + struct fib6_info *rt; __u16 nlflags; int remaining; int attrlen; - int err = 1; + int replace; int nhn = 0; - int replace = (cfg->fc_nlinfo.nlh && - (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); - LIST_HEAD(rt6_nh_list); + int err; + + replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) @@ -5279,18 +5426,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) - goto cleanup; - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } - r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); - /* RTA_ENCAP_TYPE length checked in - * lwtunnel_valid_encap_type_attr - */ + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); @@ -5303,18 +5443,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt = NULL; goto cleanup; } - if (!rt6_qualify_for_ecmp(rt)) { - err = -EINVAL; - NL_SET_ERR_MSG(extack, - "Device only routes can not be added for IPv6 using the multipath API."); - fib6_info_release(rt); + + err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); + if (err) { + rt = NULL; goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(info->nl_net, &rt6_nh_list, - rt, &r_cfg); + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; @@ -5323,12 +5461,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } - if (list_empty(&rt6_nh_list)) { - NL_SET_ERR_MSG(extack, - "Invalid nexthop configuration - no valid nexthops"); - return -EINVAL; - } - /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done @@ -5341,7 +5473,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, info->skip_notify_kernel = 1; err_nh = NULL; - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { @@ -5409,16 +5541,16 @@ add_errout: ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: - list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); - list_del(&nh->next); + list_del(&nh->list); kfree(nh); } @@ -5450,21 +5582,15 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) { - last_err = err; - goto next_rtnh; - } - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } + err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; -next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } @@ -5481,15 +5607,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (cfg.fc_nh_id && - !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - return -EINVAL; + if (cfg.fc_nh_id) { + rcu_read_lock(); + err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); + rcu_read_unlock(); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } } - if (cfg.fc_mp) + if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); - else { + } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } @@ -5505,6 +5636,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + err = fib6_config_validate(&cfg, extack); + if (err) + return err; + if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; @@ -5542,17 +5677,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i) nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); } else { - struct fib6_info *sibling, *next_sibling; struct fib6_nh *nh = f6i->fib6_nh; + struct fib6_info *sibling; nexthop_len = 0; if (f6i->fib6_nsiblings) { rt6_nh_nlmsg_size(nh, &nexthop_len); - list_for_each_entry_safe(sibling, next_sibling, - &f6i->fib6_siblings, fib6_siblings) { + rcu_read_lock(); + + list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, + fib6_siblings) { rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); } + + rcu_read_unlock(); } nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } @@ -5608,7 +5747,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; @@ -5682,7 +5821,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } else if (dest) { struct in6_addr saddr_buf; - if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && + if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -5716,7 +5855,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; } else if (rt->fib6_nsiblings) { - struct fib6_info *sibling, *next_sibling; + struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); @@ -5728,14 +5867,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, 0) < 0) goto nla_put_failure; - list_for_each_entry_safe(sibling, next_sibling, - &rt->fib6_siblings, fib6_siblings) { + rcu_read_lock(); + + list_for_each_entry_rcu(sibling, &rt->fib6_siblings, + fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6, 0) < 0) + AF_INET6, 0) < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } } + rcu_read_unlock(); + nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) @@ -5958,7 +6104,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -5968,7 +6115,6 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || @@ -5993,6 +6139,13 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } + if (tb[RTA_FLOWLABEL] && + (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Invalid flow label"); + return -EINVAL; + } + for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; @@ -6007,6 +6160,7 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: + case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); @@ -6029,6 +6183,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; + __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); @@ -6037,7 +6192,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = -EINVAL; rtm = nlmsg_data(nlh); - fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { @@ -6083,6 +6237,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); + fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); + if (iif) { struct net_device *dev; int flags = 0; @@ -6111,7 +6268,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); @@ -6172,7 +6329,9 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + rcu_read_lock(); + + skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); if (!skb) goto errout; @@ -6184,12 +6343,15 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, kfree_skb(skb); goto errout; } + + rcu_read_unlock(); + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + info->nlh, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); + rcu_read_unlock(); + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_rt_update(struct net *net, struct fib6_info *rt, @@ -6215,8 +6377,7 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt, info->nlh, gfp_any()); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, @@ -6329,7 +6490,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, +static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -6338,12 +6499,12 @@ static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, if (!write) return -EINVAL; - net = (struct net *)ctl->extra1; - delay = net->ipv6.sysctl.flush_delay; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } @@ -6428,7 +6589,6 @@ static struct ctl_table ipv6_route_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) @@ -6452,10 +6612,6 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[1].procname = NULL; } return table; @@ -6682,6 +6838,15 @@ static void bpf_iter_unregister(void) #endif #endif +static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, + .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, + .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, + .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, +}; + int __init ip6_route_init(void) { int ret; @@ -6724,19 +6889,7 @@ int __init ip6_route_init(void) if (ret) goto fib6_rules_init; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, - inet6_rtm_newroute, NULL, 0); - if (ret < 0) - goto out_register_late_subsys; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, - inet6_rtm_delroute, NULL, 0); - if (ret < 0) - goto out_register_late_subsys; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, - inet6_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED); + ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; @@ -6756,7 +6909,6 @@ int __init ip6_route_init(void) struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); - INIT_LIST_HEAD(&ul->quarantine); spin_lock_init(&ul->lock); } diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index a013b92cbb86..7c05ac846646 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -125,7 +125,8 @@ static void rpl_destroy_state(struct lwtunnel_state *lwt) } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, - const struct ipv6_rpl_sr_hdr *srh) + const struct ipv6_rpl_sr_hdr *srh, + struct dst_entry *cache_dst) { struct ipv6_rpl_sr_hdr *isrh, *csrh; const struct ipv6hdr *oldhdr; @@ -153,7 +154,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, hdrlen = ((csrh->hdrlen + 1) << 3); - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) { kfree(buf); return err; @@ -186,7 +187,8 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, return 0; } -static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) +static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; @@ -196,7 +198,7 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) tinfo = rpl_encap_lwtunnel(dst->lwtstate); - return rpl_do_srh_inline(skb, rlwt, tinfo->srh); + return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -208,14 +210,14 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); - err = rpl_do_srh(skb, rlwt); + local_bh_disable(); + dst = dst_cache_get(&rlwt->cache); + local_bh_enable(); + + err = rpl_do_srh(skb, rlwt, dst); if (unlikely(err)) goto drop; - preempt_disable(); - dst = dst_cache_get(&rlwt->cache); - preempt_enable(); - if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -230,25 +232,28 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - preempt_disable(); - dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - preempt_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } @@ -257,40 +262,54 @@ static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; - rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + rlwt = rpl_lwt_lwtunnel(lwtst); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&rlwt->cache); - preempt_enable(); + local_bh_enable(); + + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) { + dst_release(dst); + goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { - preempt_disable(); + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - return err; - return dst_input(skb); + +drop: + kfree_skb(skb); + return err; } static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype, diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 35508abd76f4..180da19c148c 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -21,9 +21,7 @@ #include <net/genetlink.h> #include <linux/seg6.h> #include <linux/seg6_genl.h> -#ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> -#endif bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) { @@ -437,13 +435,11 @@ static int __net_init seg6_net_init(struct net *net) net->ipv6.seg6_data = sdata; -#ifdef CONFIG_IPV6_SEG6_HMAC if (seg6_hmac_net_init(net)) { kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; } -#endif return 0; } @@ -452,9 +448,7 @@ static void __net_exit seg6_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); -#ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_net_exit(net); -#endif kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); @@ -520,39 +514,28 @@ int __init seg6_init(void) if (err) goto out_unregister_pernet; -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL err = seg6_iptunnel_init(); if (err) goto out_unregister_genl; err = seg6_local_init(); - if (err) { - seg6_iptunnel_exit(); - goto out_unregister_genl; - } -#endif + if (err) + goto out_unregister_iptun; -#ifdef CONFIG_IPV6_SEG6_HMAC err = seg6_hmac_init(); if (err) - goto out_unregister_iptun; -#endif + goto out_unregister_seg6; pr_info("Segment Routing with IPv6\n"); out: return err; -#ifdef CONFIG_IPV6_SEG6_HMAC -out_unregister_iptun: -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL +out_unregister_seg6: seg6_local_exit(); +out_unregister_iptun: seg6_iptunnel_exit(); -#endif -#endif -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL out_unregister_genl: genl_unregister_family(&seg6_genl_family); -#endif out_unregister_pernet: unregister_pernet_subsys(&ip6_segments_ops); goto out; @@ -560,12 +543,9 @@ out_unregister_pernet: void seg6_exit(void) { -#ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_exit(); -#endif -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL + seg6_local_exit(); seg6_iptunnel_exit(); -#endif - unregister_pernet_subsys(&ip6_segments_ops); genl_unregister_family(&seg6_genl_family); + unregister_pernet_subsys(&ip6_segments_ops); } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 861e0366f549..f78ecb6ad838 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -40,7 +40,14 @@ #include <net/seg6_hmac.h> #include <linux/random.h> -static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); +struct hmac_storage { + local_lock_t bh_lock; + char hmac_ring[SEG6_HMAC_RING_SIZE]; +}; + +static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -187,7 +194,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, */ local_bh_disable(); - ring = this_cpu_ptr(hmac_ring); + local_lock_nested_bh(&hmac_storage.bh_lock); + ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ @@ -212,6 +220,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, dgsize = __do_hmac(hinfo, ring, plen, tmp_out, SEG6_HMAC_MAX_DIGESTSIZE); + local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); if (dgsize < 0) @@ -356,6 +365,7 @@ static int seg6_hmac_init_algo(void) struct crypto_shash *tfm; struct shash_desc *shash; int i, alg_count, cpu; + int ret = -ENOMEM; alg_count = ARRAY_SIZE(hmac_algos); @@ -366,12 +376,14 @@ static int seg6_hmac_init_algo(void) algo = &hmac_algos[i]; algo->tfms = alloc_percpu(struct crypto_shash *); if (!algo->tfms) - return -ENOMEM; + goto error_out; for_each_possible_cpu(cpu) { tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto error_out; + } p_tfm = per_cpu_ptr(algo->tfms, cpu); *p_tfm = tfm; } @@ -383,18 +395,22 @@ static int seg6_hmac_init_algo(void) algo->shashs = alloc_percpu(struct shash_desc *); if (!algo->shashs) - return -ENOMEM; + goto error_out; for_each_possible_cpu(cpu) { shash = kzalloc_node(shsize, GFP_KERNEL, cpu_to_node(cpu)); if (!shash) - return -ENOMEM; + goto error_out; *per_cpu_ptr(algo->shashs, cpu) = shash; } } return 0; + +error_out: + seg6_hmac_exit(); + return ret; } int __init seg6_hmac_init(void) @@ -412,22 +428,29 @@ int __net_init seg6_hmac_net_init(struct net *net) void seg6_hmac_exit(void) { struct seg6_hmac_algo *algo = NULL; + struct crypto_shash *tfm; + struct shash_desc *shash; int i, alg_count, cpu; alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; - for_each_possible_cpu(cpu) { - struct crypto_shash *tfm; - struct shash_desc *shash; - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); + if (algo->shashs) { + for_each_possible_cpu(cpu) { + shash = *per_cpu_ptr(algo->shashs, cpu); + kfree(shash); + } + free_percpu(algo->shashs); + } + + if (algo->tfms) { + for_each_possible_cpu(cpu) { + tfm = *per_cpu_ptr(algo->tfms, cpu); + crypto_free_shash(tfm); + } + free_percpu(algo->tfms); } - free_percpu(algo->tfms); - free_percpu(algo->shashs); } } EXPORT_SYMBOL(seg6_hmac_exit); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 03b877ff4558..51583461ae29 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -124,8 +124,8 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, return flowlabel; } -/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(dst->dev); @@ -137,7 +137,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -197,11 +197,18 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) return 0; } + +/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +{ + return __seg6_do_srh_encap(skb, osrh, proto, NULL); +} EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, - struct ipv6_sr_hdr *osrh, int proto) + struct ipv6_sr_hdr *osrh, int proto, + struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); @@ -230,7 +237,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, tot_len = red_hdrlen + sizeof(struct ipv6hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -317,8 +324,8 @@ out: return 0; } -/* insert an SRH within an IPv6 packet, just after the IPv6 header */ -int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; @@ -326,7 +333,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -369,9 +376,8 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } -EXPORT_SYMBOL_GPL(seg6_do_srh_inline); -static int seg6_do_srh(struct sk_buff *skb) +static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; @@ -384,7 +390,7 @@ static int seg6_do_srh(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; - err = seg6_do_srh_inline(skb, tinfo->srh); + err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; @@ -402,9 +408,11 @@ static int seg6_do_srh(struct sk_buff *skb) return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, proto); + err = __seg6_do_srh_encap(skb, tinfo->srh, + proto, cache_dst); else - err = seg6_do_srh_encap_red(skb, tinfo->srh, proto); + err = seg6_do_srh_encap_red(skb, tinfo->srh, + proto, cache_dst); if (err) return err; @@ -425,11 +433,13 @@ static int seg6_do_srh(struct sk_buff *skb) skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, - IPPROTO_ETHERNET); + err = __seg6_do_srh_encap(skb, tinfo->srh, + IPPROTO_ETHERNET, + cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, - IPPROTO_ETHERNET); + IPPROTO_ETHERNET, + cache_dst); if (err) return err; @@ -444,6 +454,13 @@ static int seg6_do_srh(struct sk_buff *skb) return 0; } +/* insert an SRH within an IPv6 packet, just after the IPv6 header */ +int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +{ + return __seg6_do_srh_inline(skb, osrh, NULL); +} +EXPORT_SYMBOL_GPL(seg6_do_srh_inline); + static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -455,45 +472,58 @@ static int seg6_input_core(struct net *net, struct sock *sk, { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + slwt = seg6_lwt_lwtunnel(lwtst); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); + local_bh_enable(); + + err = seg6_do_srh(skb, dst); + if (unlikely(err)) { + dst_release(dst); + goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { - preempt_disable(); + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - return err; - if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, dev_net(skb->dev), NULL, skb, NULL, skb_dst(skb)->dev, seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); +drop: + kfree_skb(skb); + return err; } static int seg6_input_nf(struct sk_buff *skb) @@ -529,15 +559,15 @@ static int seg6_output_core(struct net *net, struct sock *sk, struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); + local_bh_enable(); + + err = seg6_do_srh(skb, dst); + if (unlikely(err)) + goto drop; if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -553,28 +583,31 @@ static int seg6_output_core(struct net *net, struct sock *sk, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - preempt_disable(); - dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - preempt_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 24e2b4b494cb..ac1dbd492c22 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -941,8 +941,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx6_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx6_finish); return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb); drop: @@ -954,10 +954,10 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); + enum skb_drop_reason reason; struct seg6_local_lwt *slwt; struct iphdr *iph; __be32 nhaddr; - int err; slwt = seg6_local_lwtunnel(orig_dst->lwtstate); @@ -967,9 +967,9 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, skb_dst_drop(skb); - err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); - if (err) { - kfree_skb(skb); + reason = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); + if (reason) { + kfree_skb_reason(skb, reason); return -EINVAL; } @@ -991,8 +991,8 @@ static int input_action_end_dx4(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx4_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx4_finish); return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb); drop: @@ -1174,8 +1174,8 @@ drop: static int input_action_end_dt4(struct sk_buff *skb, struct seg6_local_lwt *slwt) { + enum skb_drop_reason reason; struct iphdr *iph; - int err; if (!decap_and_validate(skb, IPPROTO_IPIP)) goto drop; @@ -1193,8 +1193,8 @@ static int input_action_end_dt4(struct sk_buff *skb, iph = ip_hdr(skb); - err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); - if (unlikely(err)) + reason = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); + if (unlikely(reason)) goto drop; return dst_input(skb); @@ -1380,7 +1380,9 @@ drop: return err; } -DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; bool seg6_bpf_has_valid_srh(struct sk_buff *skb) { @@ -1388,6 +1390,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return false; @@ -1408,8 +1411,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct seg6_bpf_srh_state *srh_state = - this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state *srh_state; struct ipv6_sr_hdr *srh; int ret; @@ -1420,10 +1422,14 @@ static int input_action_end_bpf(struct sk_buff *skb, } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - /* preempt_disable is needed to protect the per-CPU buffer srh_state, - * which is also accessed by the bpf_lwt_seg6_* helpers + /* The access to the per-CPU buffer srh_state is protected by running + * always in softirq context (with disabled BH). On PREEMPT_RT the + * required locking is provided by the following local_lock_nested_bh() + * statement. It is also accessed by the bpf_lwt_seg6_* helpers via + * bpf_prog_run_save_cb(). */ - preempt_disable(); + local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock); + srh_state = this_cpu_ptr(&seg6_bpf_srh_states); srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; srh_state->valid = true; @@ -1446,15 +1452,15 @@ static int input_action_end_bpf(struct sk_buff *skb, if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); - preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: - preempt_enable(); + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); kfree_skb(skb); return -EINVAL; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 655c9b1a19b8..a72dbca9e8fc 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -51,6 +51,8 @@ #include <net/dsfield.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> +#include <net/inet_dscp.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -132,8 +134,8 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, - struct ip_tunnel_parm *parms) +static struct ip_tunnel __rcu ** +__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -200,14 +202,13 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct sit_net *sitn = net_generic(net, sit_net_id); + struct sit_net *sitn = net_generic(t->net, sit_net_id); int err; __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); - if ((__force u16)t->parms.i_flags & SIT_ISATAP) + if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags)) dev->priv_flags |= IFF_ISATAP; dev->rtnl_link_ops = &sit_link_ops; @@ -226,7 +227,8 @@ out: } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) + struct ip_tunnel_parm_kern *parms, + int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -267,6 +269,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, nt = netdev_priv(dev); + nt->net = net; nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; @@ -934,8 +937,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, - RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, - 0, dst, tiph->saddr, 0, 0, + tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, + IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); @@ -1110,7 +1113,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) iph->daddr, iph->saddr, 0, 0, IPPROTO_IPV6, - RT_TOS(iph->tos), + iph->tos & INET_DSCP_MASK, tunnel->parms.link); if (!IS_ERR(rt)) { @@ -1135,7 +1138,8 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; } -static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, +static void ipip6_tunnel_update(struct ip_tunnel *t, + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct net *net = t->net; @@ -1196,11 +1200,11 @@ static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; struct ip_tunnel_6rd ip6rd; - struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1251,7 +1255,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) +__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1268,7 +1272,7 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1281,7 +1285,7 @@ ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1297,7 +1301,7 @@ ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1328,7 +1332,7 @@ ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1348,7 +1352,8 @@ ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { switch (cmd) { case SIOCGETTUNNEL: @@ -1432,7 +1437,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; @@ -1445,7 +1450,6 @@ static int ipip6_tunnel_init(struct net_device *dev) int err; tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); @@ -1490,7 +1494,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip6_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -1546,19 +1550,23 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], } #endif -static int ipip6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + struct net *net; int err; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); @@ -1599,8 +1607,8 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD @@ -1687,7 +1695,7 @@ static size_t ipip6_get_size(const struct net_device *dev) static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || @@ -1697,7 +1705,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || - nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) || + nla_put_be16(skb, IFLA_IPTUN_FLAGS, + ip_tunnel_flags_to_be16(parm->i_flags)) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; @@ -1795,8 +1804,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = { }; #endif -static void __net_exit sit_destroy_tunnels(struct net *net, - struct list_head *head) +static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; @@ -1811,15 +1819,15 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; - t = rtnl_dereference(sitn->tunnels[prio][h]); + t = rtnl_net_dereference(net, sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1851,7 +1859,10 @@ static int __net_init sit_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + sitn->fb_tunnel_dev->netns_immutable = true; + + t = netdev_priv(sitn->fb_tunnel_dev); + t->net = net; err = register_netdev(sitn->fb_tunnel_dev); if (err) @@ -1860,8 +1871,6 @@ static int __net_init sit_init_net(struct net *net) ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); - t = netdev_priv(sitn->fb_tunnel_dev); - strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; @@ -1871,19 +1880,9 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch_rtnl = sit_exit_batch_rtnl, + .exit_rtnl = sit_exit_rtnl_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 6d8286c299c9..9d83eadd308b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -246,7 +246,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } } - req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && @@ -275,6 +275,6 @@ out: out_free: reqsk_free(req); out_drop: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NULL; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 888676163e90..d2cd33e2698d 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -30,7 +30,7 @@ static u32 rt6_multipath_hash_fields_all_mask = static u32 ioam6_id_max = IOAM6_DEFAULT_ID; static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; -static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, +static int proc_rt6_multipath_hash_policy(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -46,7 +46,7 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, } static int -proc_rt6_multipath_hash_fields(struct ctl_table *table, int write, void *buffer, +proc_rt6_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -213,7 +213,6 @@ static struct ctl_table ipv6_table_template[] = { .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, - { } }; static struct ctl_table ipv6_rotable[] = { @@ -248,11 +247,11 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ - { } }; static int __net_init ipv6_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(ipv6_table_template); struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; @@ -264,7 +263,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ - for (i = 0; i < ARRAY_SIZE(ipv6_table_template) - 1; i++) + for (i = 0; i < table_size; i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); @@ -276,8 +275,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) goto out_ipv6_route_table; net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", - ipv6_table, - ARRAY_SIZE(ipv6_table_template)); + ipv6_table, table_size); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; @@ -313,9 +311,9 @@ out_ipv6_table: static void __net_exit ipv6_sysctl_net_exit(struct net *net) { - struct ctl_table *ipv6_table; - struct ctl_table *ipv6_route_table; - struct ctl_table *ipv6_icmp_table; + const struct ctl_table *ipv6_table; + const struct ctl_table *ipv6_route_table; + const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3f4cba49e9ee..e8e68a142649 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -60,6 +60,7 @@ #include <net/secure_seq.h> #include <net/hotdata.h> #include <net/busy_poll.h> +#include <net/rstreason.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -69,7 +70,8 @@ #include <trace/events/tcp.h> -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -95,11 +97,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } @@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); @@ -376,7 +378,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; @@ -490,14 +492,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); - if (!sock_owned_by_user(sk)) { - WRITE_ONCE(sk->sk_err, err); - sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - - tcp_done(sk); - } else { + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else WRITE_ONCE(sk->sk_err_soft, err); - } goto out; case TCP_LISTEN: break; @@ -793,7 +791,8 @@ clear_hash_nostart: static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, - struct sk_buff *skb) + struct sk_buff *skb, + u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); @@ -801,13 +800,15 @@ static void tcp_v6_init_req(struct request_sock *req, ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ireq->ir_rmt_addr = LOOPBACK4_IPV6; + ireq->ir_loc_addr = LOOPBACK4_IPV6; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - if (!TCP_SKB_CB(skb)->tcp_tw_isn && + if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || @@ -820,9 +821,10 @@ static void tcp_v6_init_req(struct request_sock *req, static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { - tcp_v6_init_req(req, sk, skb); + tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; @@ -866,16 +868,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { - const struct tcphdr *th = tcp_hdr(skb); - struct tcphdr *t1; - struct sk_buff *buff; - struct flowi6 fl6; - struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); - struct sock *ctl_sk = net->ipv6.tcp_sk; + struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); unsigned int tot_len = sizeof(struct tcphdr); + struct sock *ctl_sk = net->ipv6.tcp_sk; + const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; - __u32 mark = 0; + struct sk_buff *buff; + struct tcphdr *t1; + struct flowi6 fl6; + u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; @@ -969,11 +971,14 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } if (sk) { + /* unconstify the socket only to attach it to buff with care. */ + skb_set_owner_edemux(buff, (struct sock *)sk); + if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); - skb_set_delivery_time(buff, tcp_transmit_time(sk), true); + skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ @@ -996,7 +1001,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, - tclass & ~INET_ECN_MASK, priority); + tclass, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -1006,7 +1011,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 kfree_skb(buff); } -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -1037,7 +1043,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; @@ -1113,7 +1119,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { - trace_tcp_send_reset(sk, skb); if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); @@ -1129,8 +1134,11 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); } + trace_tcp_send_reset(sk, skb, reason); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, - ipv6_get_dsfield(ipv6h), label, priority, txhash, + ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK, + label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) @@ -1150,11 +1158,16 @@ static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, tclass, label, priority, txhash, key); } -static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u8 tclass = tw->tw_tclass; struct tcp_key key = {}; + + if (tw_status == TCP_TW_ACK_OOW) + tclass &= ~INET_ECN_MASK; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -1169,8 +1182,8 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) - key.ao_key = tcp_ao_established_key(ao_info, - aoh->rnext_keyid, -1); + key.ao_key = tcp_ao_established_key(sk, ao_info, + aoh->rnext_keyid, -1); } } if (key.ao_key) { @@ -1193,12 +1206,13 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) #endif } - tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, + READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), - tcptw->tw_ts_recent, tw->tw_bound_dev_if, &key, - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, - tw->tw_txhash); + READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, + &key, tclass, cpu_to_be32(tw->tw_flowlabel), + tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: @@ -1268,18 +1282,14 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of <SYN> segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, - &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, + req->ts_recent, sk->sk_bound_dev_if, + &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK, + 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) @@ -1440,7 +1450,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); @@ -1451,10 +1460,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; + ip6_dst_store(newsk, dst, NULL, NULL); + newnp->saddr = ireq->ir_v6_loc_addr; - newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; - newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... @@ -1507,9 +1515,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_initialize_rcv_mss(newsk); - newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; - newinet->inet_rcv_saddr = LOOPBACK4_IPV6; - #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); @@ -1621,7 +1626,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) by tcp. Feel free to propose better solution. --ANK (980728) */ - if (np->rxopt.all) + if (np->rxopt.all && sk->sk_state != TCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1659,8 +1664,6 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (reason) goto reset; } - if (opt_skb) - __kfree_skb(opt_skb); return 0; } } else @@ -1674,11 +1677,11 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; @@ -1737,8 +1740,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -1747,15 +1749,17 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; - struct net *net = dev_net(skb->dev); + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -1791,7 +1795,6 @@ lookup: if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -1835,7 +1838,8 @@ process: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } @@ -1860,7 +1864,10 @@ process: } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { - tcp_v6_send_reset(nsk, skb); + enum sk_rst_reason rst_reason; + + rst_reason = sk_rst_convert_drop_reason(drop_reason); + tcp_v6_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); @@ -1868,6 +1875,7 @@ process: } } +process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { @@ -1936,12 +1944,12 @@ csum_error: bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { - tcp_v6_send_reset(NULL, skb); + tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: @@ -1964,7 +1972,9 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2; @@ -1982,16 +1992,18 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v6_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v6_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: @@ -2002,7 +2014,7 @@ do_time_wait: void tcp_v6_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; @@ -2041,7 +2053,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, .twsk_destructor = tcp_twsk_destructor, }; @@ -2060,8 +2071,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; @@ -2094,8 +2103,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; @@ -2177,6 +2184,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + u8 icsk_pending; int rx_queue; int state; @@ -2185,14 +2193,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + timer_expires = icsk_timeout(icsk); + } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2259,7 +2268,7 @@ static void get_timewait6_sock(struct seq_file *seq, src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - tw->tw_substate, 0, 0, + READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } @@ -2380,8 +2389,14 @@ static struct inet_protosw tcpv6_protosw = { static int __net_init tcpv6_net_init(struct net *net) { - return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, - SOCK_RAW, IPPROTO_TCP, net); + int res; + + res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, + SOCK_RAW, IPPROTO_TCP, net); + if (!res) + net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; + + return res; } static void __net_exit tcpv6_net_exit(struct net *net) @@ -2389,15 +2404,9 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) -{ - tcp_twsk_purge(net_exit_list, AF_INET6); -} - static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, - .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 4b07d1e6c952..a8a04f441e78 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -7,31 +7,84 @@ */ #include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> +#include <net/inet6_hashtables.h> #include <net/gro.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" +static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ +#if IS_ENABLED(CONFIG_IPV6) + const struct ipv6hdr *hdr; + struct sk_buff *p; + struct sock *sk; + struct net *net; + int iif, sdif; + + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) + return; + + p = tcp_gro_lookup(head, th); + if (p) { + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; + return; + } + + inet6_get_iif_sdif(skb, &iif, &sdif); + hdr = skb_gro_network_header(skb); + net = dev_net_rcu(skb->dev); + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, + &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + iif, sdif); + NAPI_GRO_CB(skb)->is_flist = !sk; + if (sk) + sock_gen_put(sk); +#endif /* IS_ENABLED(CONFIG_IPV6) */ +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - ip6_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + ip6_gro_compute_pseudo)) + goto flush; + + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; - return tcp_gro_receive(head, skb); + tcp6_check_fraglist_gro(head, skb, th); + + return tcp_gro_receive(head, skb, th); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + __skb_incr_checksum_unnecessary(skb); + + return 0; + } + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, &iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; @@ -40,6 +93,70 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) return 0; } +static void __tcpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, + __be16 *oldport, __be16 newport) +{ + struct tcphdr *th = tcp_hdr(seg); + + if (!ipv6_addr_equal(oldip, newip)) { + inet_proto_csum_replace16(&th->check, seg, + oldip->s6_addr32, + newip->s6_addr32, + true); + *oldip = *newip; + } + + if (*oldport == newport) + return; + + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); + *oldport = newport; +} + +static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct tcphdr *th; + const struct ipv6hdr *iph; + struct sk_buff *seg; + struct tcphdr *th2; + struct ipv6hdr *iph2; + + seg = segs; + th = tcp_hdr(seg); + iph = ipv6_hdr(seg); + th2 = tcp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + th2 = tcp_hdr(seg); + iph2 = ipv6_hdr(seg); + + __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &th2->source, th->source); + __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &th2->dest, th->dest); + } + + return segs; +} + +static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + return __tcpv6_gso_segment_list_csum(skb); +} + static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -51,6 +168,15 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) { + struct tcphdr *th = tcp_hdr(skb); + + if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) + return __tcp6_gso_segment_list(skb, features); + + skb->ip_summed = CHECKSUM_NONE; + } + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8f7aa8bac1e7..7317f8e053f1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -34,6 +34,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/indirect_call_wrapper.h> +#include <trace/events/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> @@ -45,7 +46,7 @@ #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/ip6_tunnel.h> -#include <trace/events/udp.h> +#include <net/udp_tunnel.h> #include <net/xfrm.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> @@ -110,11 +111,22 @@ void udp_v6_rehash(struct sock *sk) u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); + u16 new_hash4; + + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + } else { + new_hash4 = udp6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + } - udp_lib_rehash(sk, new_hash); + udp_lib_rehash(sk, new_hash, new_hash4); } -static int compute_score(struct sock *sk, struct net *net, +static int compute_score(struct sock *sk, const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, int dif, int sdif) @@ -159,8 +171,51 @@ static int compute_score(struct sock *sk, struct net *net, return score; } +/** + * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port) + * @net: Network namespace + * @saddr: Source address, network order + * @sport: Source port, network order + * @daddr: Destination address, network order + * @hnum: Destination port, host order + * @dif: Destination interface index + * @sdif: Destination bridge port index, if relevant + * @udptable: Set of UDP hash tables + * + * Simplified lookup to be used as fallback if no sockets are found due to a + * potential race between (receive) address change, and lookup happening before + * the rehash operation. This function ignores SO_REUSEPORT groups while scoring + * result sockets, because if we have one, we don't need the fallback at all. + * + * Called under rcu_read_lock(). + * + * Return: socket with highest matching score if any, NULL if none + */ +static struct sock *udp6_lib_lookup1(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + const struct udp_table *udptable) +{ + unsigned int slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot = &udptable->hash[slot]; + struct sock *sk, *result = NULL; + int score, badness = 0; + + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, + saddr, sport, daddr, hnum, dif, sdif); + if (score > badness) { + result = sk; + badness = score; + } + } + + return result; +} + /* called with rcu_read_lock() */ -static struct sock *udp6_lib_lookup2(struct net *net, +static struct sock *udp6_lib_lookup2(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, @@ -168,15 +223,21 @@ static struct sock *udp6_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; + if (need_rescore) + continue; + if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; @@ -197,28 +258,108 @@ static struct sock *udp6_lib_lookup2(struct net *net, if (IS_ERR(result)) continue; - badness = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measureable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; + + hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; + +begin: + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) + return sk; + } + + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + udp4_hash4(sk); + return; + } + + if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + return; + + hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); +} +#endif /* CONFIG_BASE_SMALL */ + /* rcu_read_lock() must be held */ -struct sock *__udp6_lib_lookup(struct net *net, +struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv6_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); + + if (udp_has_hash4(hslot2)) { + result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp6_lib_lookup4 return sk or NULL */ + return result; + } /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, @@ -245,12 +386,18 @@ struct sock *__udp6_lib_lookup(struct net *net, /* Lookup wildcard sockets */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, hslot2, skb); + if (!IS_ERR_OR_NULL(result)) + goto done; + + /* Cover address change/lookup/rehash race: see __udp4_lib_lookup() */ + result = udp6_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, + udptable); + done: if (IS_ERR(result)) return NULL; @@ -288,7 +435,7 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6) -struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, +struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { struct sock *sk; @@ -440,7 +587,7 @@ csum_copy_err: SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); @@ -659,8 +806,8 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -763,7 +910,7 @@ csum_error: drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -847,7 +994,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -898,11 +1045,8 @@ start_lookup: static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { - if (udp_sk_rx_dst_set(sk, dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); - } + if (udp_sk_rx_dst_set(sk, dst)) + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and @@ -930,8 +1074,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); + struct sock *sk = NULL; struct udphdr *uh; - struct sock *sk; bool refcounted; u32 ulen = 0; @@ -1023,7 +1167,7 @@ no_sk: __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; short_packet: @@ -1044,7 +1188,7 @@ csum_error: __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; } @@ -1056,14 +1200,13 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -1160,6 +1303,18 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } +static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip6_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp6_hash4(sk); + release_sock(sk); + return res; +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1235,9 +1390,9 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); @@ -1247,8 +1402,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EINVAL; } - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) { + if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } @@ -1258,8 +1412,10 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); + + /* Don't checksum the payload, skb will get segmented */ + goto csum_partial; } - goto csum_partial; } if (is_udplite) @@ -1339,10 +1495,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); /* destination address check */ if (sin6) { @@ -1548,9 +1702,6 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); @@ -1573,7 +1724,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, - (struct rt6_info *)dst, + dst_rt6_info(dst), msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) @@ -1596,11 +1747,9 @@ back_from_confirm: WRITE_ONCE(up->pending, AF_INET6); do_append_data: - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), - &ipc6, fl6, (struct rt6_info *)dst, + &ipc6, fl6, dst_rt6_info(dst), corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); @@ -1677,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } @@ -1754,7 +1904,7 @@ struct proto udpv6_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udpv6_pre_connect, - .connect = ip6_datagram_connect, + .connect = udpv6_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udpv6_init_sock, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b41152dd4246..d8445ac1b2e4 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -117,9 +117,14 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index a17d783dc7c0..841c81abaaf4 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -58,7 +58,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) skb_postpush_rcsum(skb, skb_network_header(skb), nhlen); if (xo && (xo->flags & XFRM_GRO)) { - skb_mac_header_rebuild(skb); + /* The full l2 header needs to be preserved so that re-injecting the packet at l2 + * works correctly in the presence of vlan tags. + */ + skb_mac_header_rebuild_full(skb, xo->orig_mac_len); + skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } @@ -109,19 +113,6 @@ static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull /* Must be an IKE packet.. pass it through */ return 1; break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - /* Check if this is a keepalive packet. If so, eat it. */ - if (len == 1 && udpdata[0] == 0xff) { - return -EINVAL; - } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && - udpdata32[0] == 0 && udpdata32[1] == 0) { - - /* ESP Packet with Non-IKE marker */ - len = sizeof(struct udphdr) + 2 * sizeof(u32); - } else - /* Must be an IKE packet.. pass it through */ - return 1; - break; } /* At this point we are sure that this is an ESPinUDP packet, @@ -188,14 +179,18 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; - int ret; + int len, dlen; + __u8 *udpdata; + __be32 *udpdata32; if (skb->protocol == htons(ETH_P_IP)) return xfrm4_gro_udp_encap_rcv(sk, head, skb); - offset = offset - sizeof(struct udphdr); - - if (!pskb_pull(skb, offset)) + len = skb->len - offset; + dlen = offset + min(len, 8); + udpdata = skb_gro_header(skb, dlen, offset); + udpdata32 = (__be32 *)udpdata; + if (unlikely(!udpdata)) return NULL; rcu_read_lock(); @@ -203,11 +198,10 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (!ops || !ops->callbacks.gro_receive) goto out; - ret = __xfrm6_udp_encap_rcv(sk, skb, false); - if (ret) + /* check if it is a keepalive or IKE packet */ + if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; - skb_push(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); @@ -217,7 +211,6 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, out: rcu_read_unlock(); - skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; @@ -279,6 +272,13 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, if (!x) continue; + if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); + xfrm_state_put(x); + x = NULL; + continue; + } + spin_lock(&x->lock); if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5f7b1fdbffe6..b3d5d1f266ee 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -82,14 +82,14 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb->sk)) { + if (toobig && xfrm6_local_dontfrag(sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; } else if (toobig && xfrm6_noneed_fragment(skb)) { skb->ignore_df = 1; goto skip_frag; - } else if (!skb->ignore_df && toobig && skb->sk) { + } else if (!skb->ignore_df && toobig && sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 42fb6996b077..1f19b6f14484 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -23,23 +23,24 @@ #include <net/ip6_route.h> #include <net/l3mdev.h> -static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark) +static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(net, oif); - fl6.flowi6_mark = mark; - memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); - if (saddr) - memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); + fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, + params->oif); + fl6.flowi6_mark = params->mark; + memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); + if (params->saddr) + memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); - dst = ip6_route_output(net, NULL, &fl6); + fl6.flowi4_proto = params->ipproto; + fl6.uli = params->uli; + + dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { @@ -50,19 +51,25 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, return dst; } -static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr, - u32 mark) +static int xfrm6_get_saddr(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; + struct inet6_dev *idev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); + dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; - dev = ip6_dst_idev(dst)->dev; - ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); + idev = ip6_dst_idev(dst); + if (!idev) { + dst_release(dst); + return -EHOSTUNREACH; + } + dev = idev->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, ¶ms->daddr->in6, 0, + &saddr->in6); dst_release(dst); return 0; } @@ -70,7 +77,7 @@ static int xfrm6_get_saddr(struct net *net, int oif, static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rt6_info *rt = (struct rt6_info *)xdst->route; + struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); @@ -184,7 +191,6 @@ static struct ctl_table xfrm6_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int __net_init xfrm6_net_sysctl_init(struct net *net) @@ -218,7 +224,7 @@ err_alloc: static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; @@ -285,8 +291,14 @@ int __init xfrm6_init(void) ret = register_pernet_subsys(&xfrm6_net_ops); if (ret) goto out_protocol; + + ret = xfrm_nat_keepalive_init(AF_INET6); + if (ret) + goto out_nat_keepalive; out: return ret; +out_nat_keepalive: + unregister_pernet_subsys(&xfrm6_net_ops); out_protocol: xfrm6_protocol_fini(); out_state: @@ -298,6 +310,7 @@ out_policy: void xfrm6_fini(void) { + xfrm_nat_keepalive_fini(AF_INET6); unregister_pernet_subsys(&xfrm6_net_ops); xfrm6_protocol_fini(); xfrm6_policy_fini(); |