diff options
Diffstat (limited to 'net')
148 files changed, 2777 insertions, 1549 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b90781b9ece6..2a7f1b15714a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -354,6 +354,26 @@ out: return 0; } +static int vlan_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + + return generic_hwtstamp_get_lower(real_dev, cfg); +} + +static int vlan_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + + if (!net_eq(dev_net(dev), dev_net(real_dev))) + return -EOPNOTSUPP; + + return generic_hwtstamp_set_lower(real_dev, cfg, extack); +} + static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; @@ -365,14 +385,9 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { - case SIOCSHWTSTAMP: - if (!net_eq(dev_net(dev), dev_net(real_dev))) - break; - fallthrough; case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: - case SIOCGHWTSTAMP: if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; @@ -1081,6 +1096,8 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, .ndo_fill_forward_path = vlan_dev_fill_forward_path, + .ndo_hwtstamp_get = vlan_hwtstamp_get, + .ndo_hwtstamp_set = vlan_hwtstamp_set, }; static void vlan_dev_free(struct net_device *dev) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 00b684616e8d..c4015f30f9fa 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -1019,7 +1019,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } } - err = csocket->ops->connect(csocket, + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sin_server, sizeof(struct sockaddr_in), 0); if (err < 0) { @@ -1060,7 +1060,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) return err; } - err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server, + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server, sizeof(struct sockaddr_un) - 1, 0); if (err < 0) { pr_err("%s (%d): problem connecting socket: %s: %d\n", diff --git a/net/Kconfig b/net/Kconfig index 2fb25b534df5..d532ec33f1fe 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,11 @@ config NET_INGRESS config NET_EGRESS bool +config NET_XGRESS + select NET_INGRESS + select NET_EGRESS + bool + config NET_REDIRECT bool diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2321bd2f9964..57a7a64b84ed 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,11 +15,12 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> #include <linux/netfilter.h> +#include <net/netdev_rx_queue.h> #include <net/xdp.h> #include <net/netfilter/nf_bpf_link.h> @@ -555,12 +556,23 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } +void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +{ +} + __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } +__bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, + void *e, char f, int g) +{ + *b += 1; + return a + *b + c + d + (long)e + f + g; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -596,6 +608,7 @@ __diag_pop(); BTF_SET8_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) +BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_SET8_END(bpf_test_modify_return_ids) @@ -663,7 +676,11 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) - side_effect = 1; + side_effect++; + b = 2; + ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); + if (b != 2) + side_effect++; break; default: goto out; diff --git a/net/bridge/br.c b/net/bridge/br.c index 4f5098d33a46..a6e94ceb7c9a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -234,6 +234,14 @@ static int br_switchdev_blocking_event(struct notifier_block *nb, br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; + case SWITCHDEV_BRPORT_REPLAY: + brport_info = ptr; + b = &brport_info->brport; + + err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, + b->blocking_nb, extack); + err = notifier_from_errno(err); + break; } out: diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6116eba1bd89..9d7bc8b96b53 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -154,6 +154,7 @@ void br_forward(const struct net_bridge_port *to, backup_port = rcu_dereference(to->backup_port); if (unlikely(!backup_port)) goto out; + BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid); to = backup_port; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 05c5863d2e20..10f0d33d8ccf 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -211,6 +211,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ + 0; } @@ -319,6 +320,10 @@ static int br_port_fill_attrs(struct sk_buff *skb, backup_p->dev->ifindex); rcu_read_unlock(); + if (p->backup_nhid && + nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) + return -EMSGSIZE; + return 0; } @@ -895,6 +900,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), + [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -1065,6 +1071,12 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], return err; } + if (tb[IFLA_BRPORT_BACKUP_NHID]) { + u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); + + WRITE_ONCE(p->backup_nhid, backup_nhid); + } + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a63b32c1638e..a1f4acfa6994 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -387,6 +387,7 @@ struct net_bridge_port { struct net_bridge_vlan_group __rcu *vlgrp; #endif struct net_bridge_port __rcu *backup_port; + u32 backup_nhid; /* STP */ u8 priority; @@ -605,6 +606,8 @@ struct br_input_skb_cb { */ unsigned long fwd_hwdoms; #endif + + u32 backup_nhid; }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -971,7 +974,6 @@ int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); -int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val); #if IS_ENABLED(CONFIG_IPV6) @@ -2115,6 +2117,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack); + bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); @@ -2165,6 +2173,16 @@ br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, { } +static inline int +br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { return false; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index ba95c4d74a60..ee84e783e1df 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -727,6 +727,8 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, err = br_switchdev_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); + if (err == -EOPNOTSUPP) + err = 0; if (err) goto out_free_mdb; } @@ -759,8 +761,10 @@ static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); - if (err && err != -EOPNOTSUPP) + if (err) { + /* -EOPNOTSUPP not propagated from MDB replay. */ return err; + } err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); if (err && err != -EOPNOTSUPP) @@ -825,3 +829,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, nbp_switchdev_del(p); } + +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); +} diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 6399a8a69d07..81833ca7a2c7 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -201,6 +201,21 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, if (err) return err; + if (BR_INPUT_SKB_CB(skb)->backup_nhid) { + tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, + tunnel_id, 0); + if (!tunnel_dst) + return -ENOMEM; + + tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX | + IP_TUNNEL_INFO_BRIDGE; + tunnel_dst->u.tun_info.key.nhid = + BR_INPUT_SKB_CB(skb)->backup_nhid; + skb_dst_set(skb, &tunnel_dst->dst); + + return 0; + } + tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); diff --git a/net/core/dev.c b/net/core/dev.c index 69a3e544676c..636b41f0b32d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -107,6 +107,7 @@ #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> @@ -132,6 +133,7 @@ #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> +#include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> @@ -150,11 +152,11 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> +#include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" - static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ @@ -388,6 +390,8 @@ static void list_netdevice(struct net_device *dev) hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock(&dev_base_lock); + /* We reserved the ifindex, this can't fail */ + WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } @@ -397,8 +401,12 @@ static void list_netdevice(struct net_device *dev) */ static void unlist_netdevice(struct net_device *dev, bool lock) { + struct net *net = dev_net(dev); + ASSERT_RTNL(); + xa_erase(&net->dev_by_index, dev->ifindex); + /* Unlink dev from the device chain */ if (lock) write_lock(&dev_base_lock); @@ -2384,8 +2392,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -3882,69 +3889,198 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS -static struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) +{ + int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); - struct tcf_result cl_res; + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; if (!miniq) - return skb; + return ret; - /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. */ + switch (ret) { + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); + skb->tc_index = TC_H_MIN(res.classid); break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + int sch_ret; + + if (!entry) + return skb; + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_skb_cb(skb)->pkt_len = skb->len; + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + return NULL; case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - *ret = NET_XMIT_DROP; - kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; + /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: - *ret = NET_XMIT_SUCCESS; consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; + } + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + int sch_ret; + + if (!entry) + return skb; + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. + */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +egress_verdict: + switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; return NULL; - default: - break; + case TC_ACT_SHOT: + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + *ret = NET_XMIT_DROP; + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *ret = NET_XMIT_SUCCESS; + return NULL; } -#endif /* CONFIG_NET_CLS_ACT */ return skb; } - -static struct netdev_queue * -netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) -{ - int qm = skb_get_queue_mapping(skb); - - return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); -} - -static bool netdev_xmit_txqueue_skipped(void) +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) { - return __this_cpu_read(softnet_data.xmit.skip_txqueue); + return skb; } -void netdev_xmit_skip_txqueue(bool skip) +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); + return skb; } -EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); -#endif /* CONFIG_NET_EGRESS */ +#endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, @@ -4128,9 +4264,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); qdisc_pkt_len_init(skb); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_at_ingress = 0; -#endif + tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -5064,72 +5198,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); - struct tcf_result cl_res; - - /* If there's at least one ingress present somewhere (so - * we get here via enabled static key), remaining devices - * that are not configured with an ingress qdisc will bail - * out here. - */ - if (!miniq) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - qdisc_skb_cb(skb)->pkt_len = skb->len; - tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - skb->tc_at_ingress = 1; - mini_qdisc_bstats_cpu_update(miniq, skb); - - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { - case TC_ACT_OK: - case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); - break; - case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); - *ret = NET_RX_DROP; - return NULL; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_REDIRECT: - /* skb_mac_header check was done by cls/act_bpf, so - * we can safely push the L2 header back before - * redirecting to another netdev - */ - __skb_push(skb, skb->mac_len); - if (skb_do_redirect(skb) == -EAGAIN) { - __skb_pull(skb, skb->mac_len); - *another = true; - break; - } - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_CONSUMED: - *ret = NET_RX_SUCCESS; - return NULL; - default: - break; - } -#endif /* CONFIG_NET_CLS_ACT */ - return skb; -} - /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check @@ -6316,12 +6384,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (threaded) - set_bit(NAPI_STATE_THREADED, &napi->state); - else - clear_bit(NAPI_STATE_THREADED, &napi->state); - } + list_for_each_entry(napi, &dev->napi_list, dev_list) + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } @@ -9413,6 +9477,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; + struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; @@ -9440,12 +9505,13 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - err = dev_xdp_attach_link(dev, NULL, link); + err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); + trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } @@ -9509,23 +9575,35 @@ err_out: } /** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace + * dev_index_reserve() - allocate an ifindex in a namespace + * @net: the applicable net namespace + * @ifindex: requested ifindex, pass %0 to get one allocated * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * Allocate a ifindex for a new device. Caller must either use the ifindex + * to store the device (via list_netdevice()) or call dev_index_release() + * to give the index up. + * + * Return: a suitable unique value for a new device interface number or -errno. */ -static int dev_new_index(struct net *net) +static int dev_index_reserve(struct net *net, u32 ifindex) { - int ifindex = net->ifindex; + int err; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return net->ifindex = ifindex; - } + if (!ifindex) + err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, + xa_limit_31b, &net->ifindex, GFP_KERNEL); + else + err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); + if (err < 0) + return err; + + return ifindex; +} + +static void dev_index_release(struct net *net, int ifindex) +{ + /* Expect only unused indexes, unlist_netdevice() removes the used */ + WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } /* Delayed registration/unregisteration */ @@ -9995,11 +10073,10 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - ret = -EBUSY; - if (!dev->ifindex) - dev->ifindex = dev_new_index(net); - else if (__dev_get_by_index(net, dev->ifindex)) + ret = dev_index_reserve(net, dev->ifindex); + if (ret < 0) goto err_uninit; + dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). @@ -10046,7 +10123,7 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_uninit; + goto err_ifindex_release; ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); @@ -10102,6 +10179,8 @@ out: err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); +err_ifindex_release: + dev_index_release(net, dev->ifindex); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10617,6 +10696,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; @@ -10838,7 +10918,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* Shutdown queueing discipline. */ dev_shutdown(dev); - + dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); @@ -10978,9 +11058,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, } /* Check that new_ifindex isn't used yet. */ - err = -EBUSY; - if (new_ifindex && __dev_get_by_index(net, new_ifindex)) - goto out; + if (new_ifindex) { + err = dev_index_reserve(net, new_ifindex); + if (err < 0) + goto out; + } else { + /* If there is an ifindex conflict assign a new one */ + err = dev_index_reserve(net, dev->ifindex); + if (err == -EBUSY) + err = dev_index_reserve(net, 0); + if (err < 0) + goto out; + new_ifindex = err; + } /* * And now a mini version of register_netdevice unregister_netdevice. @@ -11008,13 +11098,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); - /* If there is an ifindex conflict assign a new one */ - if (!new_ifindex) { - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; - } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11192,6 +11275,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; @@ -11289,6 +11374,7 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 3730945ee294..b46aedc36939 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -5,6 +5,7 @@ #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> +#include <linux/phylib_stubs.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> @@ -252,14 +253,121 @@ static int dev_eth_ioctl(struct net_device *dev, return ops->ndo_eth_ioctl(dev, ifr, cmd); } +/** + * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. + * + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and + * there only exists a phydev->mii_ts->hwtstamp() method. So this will return + * -EOPNOTSUPP for phylib for now, which is still more accurate than letting + * the netdev handle the GET request. + */ +static int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + if (phy_has_hwtstamp(dev->phydev)) + return phy_hwtstamp_get(dev->phydev, cfg); + + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); +} + static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { - return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct hwtstamp_config cfg; + int err; + + if (!ops->ndo_hwtstamp_get) + return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + kernel_cfg.ifr = ifr; + err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); + if (err) + return err; + + /* If the request was resolved through an unconverted driver, omit + * the copy_to_user(), since the implementation has already done that + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +/** + * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * @extack: Netlink extended ack message structure, for error reporting + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. If the netdev driver needs to perform specific actions even for PHY + * timestamping to work properly (a switch port must trap the timestamped + * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in + * dev->priv_flags. + */ +static int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + bool phy_ts = phy_has_hwtstamp(dev->phydev); + struct kernel_hwtstamp_config old_cfg = {}; + bool changed = false; + int err; + + cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; + + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + err = ops->ndo_hwtstamp_get(dev, &old_cfg); + if (err) + return err; + } + + if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + err = ops->ndo_hwtstamp_set(dev, cfg, extack); + if (err) { + if (extack->_msg) + netdev_err(dev, "%s\n", extack->_msg); + return err; + } + } + + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) + changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); + + if (phy_ts) { + err = phy_hwtstamp_set(dev->phydev, cfg, extack); + if (err) { + if (changed) + ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); + return err; + } + } + + return 0; } static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { - struct kernel_hwtstamp_config kernel_cfg; + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; int err; @@ -268,6 +376,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return -EFAULT; hwtstamp_config_to_kernel(&kernel_cfg, &cfg); + kernel_cfg.ifr = ifr; err = net_hwtstamp_validate(&kernel_cfg); if (err) @@ -280,8 +389,80 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return err; } - return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); + if (!ops->ndo_hwtstamp_set) + return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + if (err) + return err; + + /* The driver may have modified the configuration, so copy the + * updated version of it back to user space + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, + struct kernel_hwtstamp_config *kernel_cfg) +{ + struct ifreq ifrr; + int err; + + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); + ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; + + err = dev_eth_ioctl(dev, &ifrr, cmd); + if (err) + return err; + + kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; + kernel_cfg->copied_to_user = true; + + return 0; +} + +int generic_hwtstamp_get_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_get) + return dev_get_hwtstamp_phylib(dev, kernel_cfg); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_get_lower); + +int generic_hwtstamp_set_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_set) + return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); } +EXPORT_SYMBOL(generic_hwtstamp_set_lower); static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) diff --git a/net/core/filter.c b/net/core/filter.c index 28a59596987a..a094694899c9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4339,13 +4339,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - /* XDP_REDIRECT is not supported AF_XDP yet. */ - if (unlikely(xdp_buff_has_frags(xdp))) - return -EOPNOTSUPP; - + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); - } return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); @@ -7350,8 +7345,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; - if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) - return -ESOCKTNOSUPPORT; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; @@ -9306,7 +9301,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. @@ -9340,7 +9335,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 85a2d0d9bd39..89d15ceaf9af 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -40,7 +40,7 @@ static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - flow_dissector->used_keys |= (1 << key_id); + flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, @@ -205,6 +205,50 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_ah(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_ah; + struct ip_auth_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_ah = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_ah->spi = hdr->spi; +} + +static void __skb_flow_dissect_esp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_esp; + struct ip_esp_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_esp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_esp->spi = hdr->spi; +} + static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, @@ -1571,7 +1615,14 @@ ip_proto_again: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; - + case IPPROTO_ESP: + __skb_flow_dissect_esp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_AH: + __skb_flow_dissect_ah(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index acfc1f88ea79..bc5169482710 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -146,6 +146,13 @@ void flow_rule_match_tcp(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_tcp); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out); +} +EXPORT_SYMBOL(flow_rule_match_ipsec); + void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 15e3f4606b5f..fccaa5bac0ed 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> +#include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a4270fafdf11..797c813c7c77 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, return -EINVAL; } + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) { + genlmsg_cancel(rsp, hdr); + return -EINVAL; + } + } + genlmsg_end(rsp, hdr); return 0; @@ -93,43 +101,22 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *netdev; - int idx = 0, s_idx; - int h, s_h; - int err; - - s_h = cb->args[0]; - s_idx = cb->args[1]; + int err = 0; rtnl_lock(); - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(netdev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = netdev_nl_dev_fill(netdev, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NETDEV_CMD_DEV_GET); - if (err < 0) - break; -cont: - idx++; - } + for_each_netdev_dump(net, netdev, cb->args[0]) { + err = netdev_nl_dev_fill(netdev, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + NETDEV_CMD_DEV_GET); + if (err < 0) + break; } - rtnl_unlock(); if (err != -EMSGSIZE) return err; - cb->args[1] = idx; - cb->args[0] = h; - cb->seq = net->dev_base_seq; - return skb->len; } diff --git a/net/core/of_net.c b/net/core/of_net.c index 55d3fe229269..93ea425b9248 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/of_net.h> #include <linux/of_platform.h> +#include <linux/platform_device.h> #include <linux/phy.h> #include <linux/export.h> #include <linux/device.h> diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a3e12a61d456..77cb75e63aca 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include <linux/device.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -58,6 +58,17 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_recycle_released_ref", }; +/** + * page_pool_get_stats() - fetch page pool stats + * @pool: pool from which page was allocated + * @stats: struct page_pool_stats to fill in + * + * Retrieve statistics about the page_pool. This API is only available + * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. + * A pointer to a caller allocated struct page_pool_stats structure + * is passed to this API which is filled in. The caller can then report + * those stats to the user (perhaps via ethtool, debugfs, etc.). + */ bool page_pool_get_stats(struct page_pool *pool, struct page_pool_stats *stats) { @@ -224,6 +235,10 @@ static int page_pool_init(struct page_pool *pool, return 0; } +/** + * page_pool_create() - create a page pool. + * @params: parameters, see struct page_pool_params + */ struct page_pool *page_pool_create(const struct page_pool_params *params) { struct page_pool *pool; @@ -492,7 +507,7 @@ static s32 page_pool_inflight(struct page_pool *pool) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_release_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; @@ -518,13 +533,6 @@ skip_dma_unmap: */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); -} -EXPORT_SYMBOL(page_pool_release_page); - -/* Return a page to the page allocator, cleaning up our state */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) -{ - page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -579,6 +587,8 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + lockdep_assert_no_hardirq(); + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -616,9 +626,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - /* Do not replace this with page_pool_return_page() */ - page_pool_release_page(pool, page); - put_page(page); + page_pool_return_page(pool, page); return NULL; } @@ -635,7 +643,21 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, } EXPORT_SYMBOL(page_pool_put_defragged_page); -/* Caller must not use data area after call, as this function overwrites it */ +/** + * page_pool_put_page_bulk() - release references on multiple pages + * @pool: pool from which pages were allocated + * @data: array holding page pointers + * @count: number of pages in @data + * + * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() + * will release leftover pages to the page allocator. + * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_page_bulk(), as this function overwrites it. + */ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { @@ -915,42 +937,3 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); - -bool page_pool_return_skb_page(struct page *page, bool napi_safe) -{ - struct napi_struct *napi; - struct page_pool *pp; - bool allow_direct; - - page = compound_head(page); - - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; - - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - */ - napi = READ_ONCE(pp->p.napi); - allow_direct = napi_safe && napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, allow_direct); - - return true; -} -EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index aef25aa5cf1d..6d2180b8edb3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -61,7 +61,7 @@ #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 43 +#define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; @@ -1273,7 +1273,6 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; @@ -1343,7 +1342,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) - goto nla_put_vfinfo_failure; + return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || @@ -1414,8 +1413,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_vf_failure: nla_nest_cancel(skb, vf); -nla_put_vfinfo_failure: - nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } @@ -1441,8 +1438,10 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) + if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { + nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; + } } nla_nest_end(skb, vfinfo); diff --git a/net/core/scm.c b/net/core/scm.c index 3cd7dd377e53..880027ecf516 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -130,6 +130,7 @@ EXPORT_SYMBOL(__scm_destroy); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct cmsghdr *cmsg; int err; @@ -153,7 +154,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case SCM_RIGHTS: - if (!sock->ops || sock->ops->family != PF_UNIX) + if (!ops || ops->family != PF_UNIX) goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a298992060e6..33fdf04d4334 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -73,7 +73,7 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/dropreason.h> #include <linux/uaccess.h> @@ -879,11 +879,56 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page, bool napi_safe) +{ + bool allow_direct = false; + struct page_pool *pp; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + if (napi_safe || in_softirq()) { + const struct napi_struct *napi = READ_ONCE(pp->p.napi); + + allow_direct = napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + } + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, allow_direct); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data), napi_safe); } static void skb_kfree_head(void *head, unsigned int end_offset) @@ -6204,7 +6249,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); * * @header_len: size of linear part * @data_len: needed length in frags - * @max_page_order: max page order desired. + * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * @@ -6212,21 +6257,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, - int max_page_order, + int order, int *errcode, gfp_t gfp_mask) { - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; unsigned long chunk; struct sk_buff *skb; struct page *page; - int i; + int nr_frags = 0; *errcode = -EMSGSIZE; - /* Note this test could be relaxed, if we succeed to allocate - * high order pages... - */ - if (npages > MAX_SKB_FRAGS) + if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; @@ -6234,34 +6275,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (!skb) return NULL; - skb->truesize += npages << PAGE_SHIFT; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | - __GFP_NOWARN, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } + while (data_len) { + if (nr_frags == MAX_SKB_FRAGS - 1) + goto failure; + while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; + + if (order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (!page) { + order--; + continue; + } + } else { + page = alloc_page(gfp_mask); + if (!page) + goto failure; } - page = alloc_page(gfp_mask); - if (!page) - goto failure; -fill_page: chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); + skb_fill_page_desc(skb, nr_frags, page, 0, chunk); + nr_frags++; + skb->truesize += (PAGE_SIZE << order); data_len -= chunk; - npages -= 1 << order; } return skb; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ef1a2eb6520b..a0659fc29bcc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1204,13 +1204,17 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops; int copied; trace_sk_data_ready(sk); - if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) + if (unlikely(!sock)) return; - copied = sock->ops->read_skb(sk, sk_psock_verdict_recv); + ops = READ_ONCE(sock->ops); + if (!ops || !ops->read_skb) + return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; diff --git a/net/core/sock.c b/net/core/sock.c index 732fc37a4771..525619776c6f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1247,17 +1247,11 @@ set_sndbuf: break; case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); - else - clear_bit(SOCK_PASSCRED, &sock->flags); + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); break; case SO_PASSPIDFD: - if (valbool) - set_bit(SOCK_PASSPIDFD, &sock->flags); - else - clear_bit(SOCK_PASSPIDFD, &sock->flags); + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); break; case SO_TIMESTAMP_OLD: @@ -1283,14 +1277,19 @@ set_sndbuf: break; case SO_RCVLOWAT: + { + int (*set_rcvlowat)(struct sock *sk, int val) = NULL; + if (val < 0) val = INT_MAX; - if (sock && sock->ops->set_rcvlowat) - ret = sock->ops->set_rcvlowat(sk, val); + if (sock) + set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; + if (set_rcvlowat) + ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; - + } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, @@ -1361,10 +1360,7 @@ set_sndbuf: break; case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -1388,11 +1384,16 @@ set_sndbuf: break; case SO_PEEK_OFF: - if (sock->ops->set_peek_off) - ret = sock->ops->set_peek_off(sk, val); + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; + } case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); @@ -1823,14 +1824,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_PEERNAME: { - char address[128]; + struct sockaddr_storage address; - lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_sockptr(optval, address, len)) + if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } @@ -1867,7 +1868,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PEEK_OFF: - if (!sock->ops->set_peek_off) + if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); diff --git a/net/core/xdp.c b/net/core/xdp.c index 8362130bf085..a70670fe9a2d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -14,7 +14,7 @@ #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/bug.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index d76c9be5bfca..57d9c026aa3f 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -105,7 +105,6 @@ extern int sysctl_dccp_rx_ccid; extern int sysctl_dccp_tx_ccid; int dccp_feat_init(struct sock *sk); -void dccp_feat_initialise_sysctls(void); int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index fa8079303cb0..8e919cfe6e23 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -474,7 +474,8 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, - .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_tos = ip_sock_rt_tos(sk), + .flowi4_scope = ip_sock_rt_scope(sk), .flowi4_proto = sk->sk_protocol, .fl4_sport = dccp_hdr(skb)->dccph_dport, .fl4_dport = dccp_hdr(skb)->dccph_sport, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d29d1163203d..686090bc5945 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1056,6 +1056,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), + .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h index 7e4c2a3b322b..c5d14c48def1 100644 --- a/net/dccp/ipv6.h +++ b/net/dccp/ipv6.h @@ -13,10 +13,6 @@ struct dccp6_sock { struct dccp_sock dccp; - /* - * ipv6_pinfo has to be the last member of dccp6_sock, - * see inet6_sk_generic. - */ struct ipv6_pinfo inet6; }; diff --git a/net/devlink/Makefile b/net/devlink/Makefile index ef91a76646a3..a087af581847 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := leftover.o core.o netlink.o dev.o health.o +obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o health.o diff --git a/net/devlink/dev.c b/net/devlink/dev.c index bf1d6f1bcfc7..5dfba2248b90 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -196,7 +196,7 @@ void devlink_notify(struct devlink *devlink, enum devlink_command cmd) msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; @@ -217,17 +217,18 @@ int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); } -const struct devlink_cmd devl_cmd_get = { - .dump_one = devlink_nl_cmd_get_dump_one, -}; +int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one); +} static void devlink_reload_failed_set(struct devlink *devlink, bool reload_failed) @@ -804,7 +805,7 @@ err_cancel_msg: return err; } -int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info) +int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; @@ -826,8 +827,8 @@ int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { int err; @@ -840,9 +841,10 @@ devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_info_get = { - .dump_one = devlink_nl_cmd_info_get_dump_one, -}; +int devlink_nl_info_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(msg, cb, devlink_nl_info_get_dump_one); +} static int devlink_nl_flash_update_fill(struct sk_buff *msg, struct devlink *devlink, diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 62921b2eb0d3..7fdd956ff992 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -12,6 +12,8 @@ #include <net/devlink.h> #include <net/net_namespace.h> +#include "netlink_gen.h" + #define DEVLINK_REGISTERED XA_MARK_1 #define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ @@ -114,12 +116,15 @@ struct devlink_nl_dump_state { }; }; +typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb); + struct devlink_cmd { - int (*dump_one)(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb); + devlink_nl_dump_one_func_t *dump_one; }; -extern const struct genl_small_ops devlink_nl_ops[56]; +extern const struct genl_small_ops devlink_nl_small_ops[54]; struct devlink * devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); @@ -127,8 +132,9 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); void devlink_notify_unregister(struct devlink *devlink); void devlink_notify_register(struct devlink *devlink); -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, - struct netlink_callback *cb); +int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, + devlink_nl_dump_one_func_t *dump_one); +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, struct netlink_callback *cb); static inline struct devlink_nl_dump_state * devlink_dump_state(struct netlink_callback *cb) @@ -149,7 +155,6 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) } /* Commands */ -extern const struct devlink_cmd devl_cmd_get; extern const struct devlink_cmd devl_cmd_port_get; extern const struct devlink_cmd devl_cmd_sb_get; extern const struct devlink_cmd devl_cmd_sb_pool_get; @@ -157,7 +162,6 @@ extern const struct devlink_cmd devl_cmd_sb_port_pool_get; extern const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get; extern const struct devlink_cmd devl_cmd_param_get; extern const struct devlink_cmd devl_cmd_region_get; -extern const struct devlink_cmd devl_cmd_info_get; extern const struct devlink_cmd devl_cmd_health_reporter_get; extern const struct devlink_cmd devl_cmd_trap_get; extern const struct devlink_cmd devl_cmd_trap_group_get; @@ -214,11 +218,9 @@ struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info); /* Devlink nl cmds */ -int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 1f00f874471f..e7900d9fa205 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -3946,7 +3946,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get || devlink->reload_failed) + if (!param->get) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -3955,7 +3955,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set || devlink->reload_failed) + if (!param->set) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } @@ -6278,14 +6278,7 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, return devlink_trap_policer_set(devlink, policer_item, info); } -const struct genl_small_ops devlink_nl_ops[56] = { - { - .cmd = DEVLINK_CMD_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, +const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_PORT_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6534,13 +6527,6 @@ const struct genl_small_ops devlink_nl_ops[56] = { .flags = GENL_ADMIN_PERM, }, { - .cmd = DEVLINK_CMD_INFO_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_info_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_get_doit, @@ -6843,8 +6829,10 @@ int devl_port_register_with_ops(struct devlink *devlink, spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) + if (err) { + devlink_port->registered = false; return err; + } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 7a332eb70f70..bada2819827b 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -109,8 +109,8 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) return ERR_PTR(-ENODEV); } -static int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct devlink_linecard *linecard; struct devlink_port *devlink_port; @@ -167,8 +167,8 @@ unlock: return err; } -static void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink; @@ -178,7 +178,6 @@ static void devlink_nl_post_doit(const struct genl_split_ops *ops, } static const struct devlink_cmd *devl_cmds[] = { - [DEVLINK_CMD_GET] = &devl_cmd_get, [DEVLINK_CMD_PORT_GET] = &devl_cmd_port_get, [DEVLINK_CMD_SB_GET] = &devl_cmd_sb_get, [DEVLINK_CMD_SB_POOL_GET] = &devl_cmd_sb_pool_get, @@ -186,7 +185,6 @@ static const struct devlink_cmd *devl_cmds[] = { [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = &devl_cmd_sb_tc_pool_bind_get, [DEVLINK_CMD_PARAM_GET] = &devl_cmd_param_get, [DEVLINK_CMD_REGION_GET] = &devl_cmd_region_get, - [DEVLINK_CMD_INFO_GET] = &devl_cmd_info_get, [DEVLINK_CMD_HEALTH_REPORTER_GET] = &devl_cmd_health_reporter_get, [DEVLINK_CMD_TRAP_GET] = &devl_cmd_trap_get, [DEVLINK_CMD_TRAP_GROUP_GET] = &devl_cmd_trap_group_get, @@ -196,23 +194,19 @@ static const struct devlink_cmd *devl_cmds[] = { [DEVLINK_CMD_SELFTESTS_GET] = &devl_cmd_selftests_get, }; -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, + devlink_nl_dump_one_func_t *dump_one) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_nl_dump_state *state = devlink_dump_state(cb); - const struct devlink_cmd *cmd; struct devlink *devlink; int err = 0; - cmd = devl_cmds[info->op.cmd]; - while ((devlink = devlinks_xa_find_get(sock_net(msg->sk), &state->instance))) { devl_lock(devlink); if (devl_is_registered(devlink)) - err = cmd->dump_one(msg, devlink, cb); + err = dump_one(msg, devlink, cb); else err = 0; @@ -233,6 +227,15 @@ int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, return msg->len; } +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct devlink_cmd *cmd = devl_cmds[info->op.cmd]; + + return devlink_nl_dumpit(msg, cb, cmd->dump_one); +} + struct genl_family devlink_nl_family __ro_after_init = { .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, @@ -243,8 +246,10 @@ struct genl_family devlink_nl_family __ro_after_init = { .pre_doit = devlink_nl_pre_doit, .post_doit = devlink_nl_post_doit, .module = THIS_MODULE, - .small_ops = devlink_nl_ops, - .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .small_ops = devlink_nl_small_ops, + .n_small_ops = ARRAY_SIZE(devlink_nl_small_ops), + .split_ops = devlink_nl_ops, + .n_split_ops = ARRAY_SIZE(devlink_nl_ops), .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c new file mode 100644 index 000000000000..32d8cbed0c30 --- /dev/null +++ b/net/devlink/netlink_gen.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/devlink.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netlink_gen.h" + +#include <uapi/linux/devlink.h> + +/* DEVLINK_CMD_GET - do */ +static const struct nla_policy devlink_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_INFO_GET - do */ +static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* Ops table for devlink */ +const struct genl_split_ops devlink_nl_ops[4] = { + { + .cmd = DEVLINK_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_get_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_INFO_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_info_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_info_get_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_INFO_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_info_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +}; diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h new file mode 100644 index 000000000000..11980e04a718 --- /dev/null +++ b/net/devlink/netlink_gen.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/devlink.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_DEVLINK_GEN_H +#define _LINUX_DEVLINK_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/devlink.h> + +/* Ops table for devlink */ +extern const struct genl_split_ops devlink_nl_ops[4]; + +int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_info_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); + +#endif /* _LINUX_DEVLINK_GEN_H */ diff --git a/net/dsa/port.c b/net/dsa/port.c index 2f6195d7b741..37ab238e8304 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1568,27 +1568,6 @@ static void dsa_port_phylink_validate(struct phylink_config *config, phylink_generic_validate(config, supported, state); } -static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - int err; - - /* Only called for inband modes */ - if (!ds->ops->phylink_mac_link_state) { - state->link = 0; - return; - } - - err = ds->ops->phylink_mac_link_state(ds, dp->index, state); - if (err < 0) { - dev_err(ds->dev, "p%d: phylink_mac_link_state() failed: %d\n", - dp->index, err); - state->link = 0; - } -} - static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) @@ -1646,17 +1625,6 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config, return err; } -static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_an_restart) - return; - - ds->ops->phylink_mac_an_restart(ds, dp->index); -} - static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) @@ -1700,11 +1668,9 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, .mac_select_pcs = dsa_port_phylink_mac_select_pcs, - .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_prepare = dsa_port_phylink_mac_prepare, .mac_config = dsa_port_phylink_mac_config, .mac_finish = dsa_port_phylink_mac_finish, - .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, .mac_link_up = dsa_port_phylink_mac_link_up, }; @@ -1720,21 +1686,18 @@ int dsa_port_phylink_create(struct dsa_port *dp) if (err) mode = PHY_INTERFACE_MODE_NA; - /* Presence of phylink_mac_link_state or phylink_mac_an_restart is - * an indicator of a legacy phylink driver. - */ - if (ds->ops->phylink_mac_link_state || - ds->ops->phylink_mac_an_restart) - dp->pl_config.legacy_pre_march2020 = true; - if (ds->ops->phylink_get_caps) { ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config); } else { /* For legacy drivers */ - __set_bit(PHY_INTERFACE_MODE_INTERNAL, - dp->pl_config.supported_interfaces); - __set_bit(PHY_INTERFACE_MODE_GMII, - dp->pl_config.supported_interfaces); + if (mode != PHY_INTERFACE_MODE_NA) { + __set_bit(mode, dp->pl_config.supported_interfaces); + } else { + __set_bit(PHY_INTERFACE_MODE_INTERNAL, + dp->pl_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_GMII, + dp->pl_config.supported_interfaces); + } } pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 527b1d576460..48db91b33390 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -21,6 +21,7 @@ #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> +#include <linux/string.h> #include "dsa.h" #include "port.h" @@ -1056,10 +1057,10 @@ static void dsa_slave_get_strings(struct net_device *dev, if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; - strncpy(data, "tx_packets", len); - strncpy(data + len, "tx_bytes", len); - strncpy(data + 2 * len, "rx_packets", len); - strncpy(data + 3 * len, "rx_bytes", len); + strscpy_pad(data, "tx_packets", len); + strscpy_pad(data + len, "tx_bytes", len); + strscpy_pad(data + 2 * len, "rx_packets", len); + strscpy_pad(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index e757c8de06f1..e5ff7c34e577 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -75,10 +75,6 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) return NULL; } - /* Remove QCA tag and recalculate checksum */ - skb_pull_rcsum(skb, QCA_HDR_LEN); - dsa_strip_etype_header(skb, QCA_HDR_LEN); - /* Get source port information */ port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr); @@ -86,6 +82,10 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) if (!skb->dev) return NULL; + /* Remove QCA tag and recalculate checksum */ + skb_pull_rcsum(skb, QCA_HDR_LEN); + dsa_strip_etype_header(skb, QCA_HDR_LEN); + return skb; } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 5fb19050991e..f5598c5f50de 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -665,9 +665,8 @@ const struct ethtool_phy_ops *ethtool_phy_ops; void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) { - rtnl_lock(); + ASSERT_RTNL(); ethtool_phy_ops = ops; - rtnl_unlock(); } EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 4a51e0ec295c..0b0ce4f81c01 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -907,6 +907,38 @@ static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, return 0; } +static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info, + size_t *info_size, void __user *useraddr) +{ + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. + */ + if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) + *info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info->data)); + + if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) + return -EFAULT; + + if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) { + *info_size = sizeof(*info); + if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) + return -EFAULT; + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. + */ + if (!(info->flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info->cmd != cmd) + return -EINVAL; + + return 0; +} + static int ethtool_rxnfc_copy_to_user(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) @@ -944,16 +976,9 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_SRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; + rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (rc) + return rc; rc = dev->ethtool_ops->set_rxnfc(dev, &info); if (rc) @@ -978,33 +1003,9 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (!ops->get_rxnfc) return -EOPNOTSUPP; - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_GRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - - /* If FLOW_RSS was requested then user-space must be using the - * new definition, as FLOW_RSS is newer. - */ - if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { - info_size = sizeof(info); - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - /* Since malicious users may modify the original data, - * we need to check whether FLOW_RSS is still requested. - */ - if (!(info.flow_type & FLOW_RSS)) - return -EINVAL; - } - - if (info.cmd != cmd) - return -EINVAL; + ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (ret) + return ret; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { @@ -3207,7 +3208,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v4_m_spec->ip4src || v4_m_spec->ip4dst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv4); } @@ -3222,7 +3223,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v4_m_spec->psrc || v4_m_spec->pdst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } @@ -3259,7 +3260,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) || !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv6); } @@ -3274,7 +3275,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v6_m_spec->psrc || v6_m_spec->pdst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } @@ -3282,7 +3283,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) match->key.ip.tos = v6_spec->tclass; match->mask.ip.tos = v6_m_spec->tclass; match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IP); + BIT_ULL(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } @@ -3306,7 +3307,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) break; } - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = offsetof(struct ethtool_rx_flow_key, basic); @@ -3339,7 +3340,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (ext_m_spec->vlan_etype || ext_m_spec->vlan_tci) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_VLAN); + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct ethtool_rx_flow_key, vlan); } @@ -3354,7 +3355,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) ETH_ALEN); match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = offsetof(struct ethtool_rx_flow_key, eth_addrs); } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 39a459b0111b..ae344f1b0bbd 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -252,8 +252,7 @@ int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply - * @pos_hash: saved iteration position - hashbucket - * @pos_idx: saved iteration position - index + * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used @@ -263,8 +262,7 @@ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; - int pos_hash; - int pos_idx; + unsigned long pos_ifindex; }; static const struct ethnl_request_ops * @@ -490,55 +488,27 @@ static int ethnl_default_dumpit(struct sk_buff *skb, { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; rtnl_lock(); - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - unsigned int seq; - - head = &net->dev_index_head[h]; - -restart_chain: - seq = net->dev_base_seq; - cb->seq = seq; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - dev_hold(dev); - rtnl_unlock(); - - ret = ethnl_default_dump_one(skb, dev, ctx, cb); - dev_put(dev); - if (ret < 0) { - if (ret == -EOPNOTSUPP) - goto lock_and_cont; - if (likely(skb->len)) - ret = skb->len; - goto out; - } -lock_and_cont: - rtnl_lock(); - if (net->dev_base_seq != seq) { - s_idx = idx + 1; - goto restart_chain; - } -cont: - idx++; - } + for_each_netdev_dump(net, dev, ctx->pos_ifindex) { + dev_hold(dev); + rtnl_unlock(); + + ret = ethnl_default_dump_one(skb, dev, ctx, cb); + + rtnl_lock(); + dev_put(dev); + if (ret < 0 && ret != -EOPNOTSUPP) { + if (likely(skb->len)) + ret = skb->len; + break; + } } rtnl_unlock(); -out: - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - return ret; } @@ -584,8 +554,7 @@ static int ethnl_default_start(struct netlink_callback *cb) ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; - ctx->pos_hash = 0; - ctx->pos_idx = 0; + ctx->pos_ifindex = 0; return 0; diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index 67fb414ca859..05f752557b5e 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -212,8 +212,7 @@ err_unlock_rtnl: struct ethnl_tunnel_info_dump_ctx { struct ethnl_req_info req_info; - int pos_hash; - int pos_idx; + unsigned long ifindex; }; int ethnl_tunnel_info_start(struct netlink_callback *cb) @@ -243,57 +242,39 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; void *ehdr; rtnl_lock(); - cb->seq = net->dev_base_seq; - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - - head = &net->dev_index_head[h]; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - - ehdr = ethnl_dump_put(skb, cb, - ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); - if (!ehdr) { - ret = -EMSGSIZE; - goto out; - } - - ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - goto out; - } - - ctx->req_info.dev = dev; - ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); - ctx->req_info.dev = NULL; - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - if (ret == -EOPNOTSUPP) - goto cont; - goto out; - } - genlmsg_end(skb, ehdr); -cont: - idx++; + for_each_netdev_dump(net, dev, ctx->ifindex) { + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); + if (!ehdr) { + ret = -EMSGSIZE; + break; } + + ret = ethnl_fill_reply_header(skb, dev, + ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + break; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + continue; + break; + } + genlmsg_end(skb, ehdr); } -out: rtnl_unlock(); - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - if (ret == -EMSGSIZE && skb->len) return skb->len; return ret; diff --git a/net/handshake/Makefile b/net/handshake/Makefile index 247d73c6ff6e..ef4d9a2112bd 100644 --- a/net/handshake/Makefile +++ b/net/handshake/Makefile @@ -8,6 +8,6 @@ # obj-y += handshake.o -handshake-y := genl.o netlink.o request.o tlshd.o trace.o +handshake-y := alert.o genl.o netlink.o request.o tlshd.o trace.o obj-$(CONFIG_NET_HANDSHAKE_KUNIT_TEST) += handshake-test.o diff --git a/net/handshake/alert.c b/net/handshake/alert.c new file mode 100644 index 000000000000..329d91984683 --- /dev/null +++ b/net/handshake/alert.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Handle the TLS Alert protocol + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2023, Oracle and/or its affiliates. + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet.h> + +#include <net/sock.h> +#include <net/handshake.h> +#include <net/tls.h> +#include <net/tls_prot.h> + +#include "handshake.h" + +#include <trace/events/handshake.h> + +/** + * tls_alert_send - send a TLS Alert on a kTLS socket + * @sock: open kTLS socket to send on + * @level: TLS Alert level + * @description: TLS Alert description + * + * Returns zero on success or a negative errno. + */ +int tls_alert_send(struct socket *sock, u8 level, u8 description) +{ + u8 record_type = TLS_RECORD_TYPE_ALERT; + u8 buf[CMSG_SPACE(sizeof(record_type))]; + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg; + struct kvec iov; + u8 alert[2]; + int ret; + + trace_tls_alert_send(sock->sk, level, description); + + alert[0] = level; + alert[1] = description; + iov.iov_base = alert; + iov.iov_len = sizeof(alert); + + memset(buf, 0, sizeof(buf)); + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + msg.msg_flags = MSG_DONTWAIT; + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_TLS; + cmsg->cmsg_type = TLS_SET_RECORD_TYPE; + cmsg->cmsg_len = CMSG_LEN(sizeof(record_type)); + memcpy(CMSG_DATA(cmsg), &record_type, sizeof(record_type)); + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, iov.iov_len); + ret = sock_sendmsg(sock, &msg); + return ret < 0 ? ret : 0; +} + +/** + * tls_get_record_type - Look for TLS RECORD_TYPE information + * @sk: socket (for IP address information) + * @cmsg: incoming message to be parsed + * + * Returns zero or a TLS_RECORD_TYPE value. + */ +u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *cmsg) +{ + u8 record_type; + + if (cmsg->cmsg_level != SOL_TLS) + return 0; + if (cmsg->cmsg_type != TLS_GET_RECORD_TYPE) + return 0; + + record_type = *((u8 *)CMSG_DATA(cmsg)); + trace_tls_contenttype(sk, record_type); + return record_type; +} +EXPORT_SYMBOL(tls_get_record_type); + +/** + * tls_alert_recv - Parse TLS Alert messages + * @sk: socket (for IP address information) + * @msg: incoming message to be parsed + * @level: OUT - TLS AlertLevel value + * @description: OUT - TLS AlertDescription value + * + */ +void tls_alert_recv(const struct sock *sk, const struct msghdr *msg, + u8 *level, u8 *description) +{ + const struct kvec *iov; + u8 *data; + + iov = msg->msg_iter.kvec; + data = iov->iov_base; + *level = data[0]; + *description = data[1]; + + trace_tls_alert_recv(sk, *level, *description); +} +EXPORT_SYMBOL(tls_alert_recv); diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h index 4dac965c99df..a48163765a7a 100644 --- a/net/handshake/handshake.h +++ b/net/handshake/handshake.h @@ -41,8 +41,11 @@ struct handshake_req { enum hr_flags_bits { HANDSHAKE_F_REQ_COMPLETED, + HANDSHAKE_F_REQ_SESSION, }; +struct genl_info; + /* Invariants for all handshake requests for one transport layer * security protocol */ @@ -63,6 +66,9 @@ enum hp_flags_bits { HANDSHAKE_F_PROTO_NOTIFY, }; +/* alert.c */ +int tls_alert_send(struct socket *sock, u8 level, u8 description); + /* netlink.c */ int handshake_genl_notify(struct net *net, const struct handshake_proto *proto, gfp_t flags); diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index b735f5cced2f..bbfb4095ddd6 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -18,6 +18,7 @@ #include <net/sock.h> #include <net/handshake.h> #include <net/genetlink.h> +#include <net/tls_prot.h> #include <uapi/linux/keyctl.h> #include <uapi/linux/handshake.h> @@ -100,6 +101,9 @@ static void tls_handshake_done(struct handshake_req *req, if (info) tls_handshake_remote_peerids(treq, info); + if (!status) + set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags); + treq->th_consumer_done(treq->th_consumer_data, -status, treq->th_peerid[0]); } @@ -424,3 +428,22 @@ bool tls_handshake_cancel(struct sock *sk) return handshake_req_cancel(sk); } EXPORT_SYMBOL(tls_handshake_cancel); + +/** + * tls_handshake_close - send a Closure alert + * @sock: an open socket + * + */ +void tls_handshake_close(struct socket *sock) +{ + struct handshake_req *req; + + req = handshake_req_hash_lookup(sock->sk); + if (!req) + return; + if (!test_and_clear_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags)) + return; + tls_alert_send(sock, TLS_ALERT_LEVEL_WARNING, + TLS_ALERT_DESC_CLOSE_NOTIFY); +} +EXPORT_SYMBOL(tls_handshake_close); diff --git a/net/handshake/trace.c b/net/handshake/trace.c index 1c4d8e27e17a..44432d0857b9 100644 --- a/net/handshake/trace.c +++ b/net/handshake/trace.c @@ -8,8 +8,10 @@ */ #include <linux/types.h> +#include <linux/ipv6.h> #include <net/sock.h> +#include <net/inet_sock.h> #include <net/netlink.h> #include <net/genetlink.h> diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 501552d9753b..8c99e64e1cea 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -23,7 +23,5 @@ void __exit hsr_netlink_exit(void); void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], struct hsr_port *port); void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]); -void hsr_nl_framedrop(int dropcount, int dev_idx); -void hsr_nl_linkdown(int dev_idx); #endif /* __HSR_NETLINK_H */ diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 4406d796cc2f..39dcccf0f174 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -51,8 +51,6 @@ static bool is_unsupported(u32 member_offset) return false; } -extern struct btf *btf_vmlinux; - static bool bpf_tcp_ca_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0819d6001b9a..7876b7d703cb 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -28,9 +28,9 @@ #include <net/tcp.h> #include <net/sock_reuseport.h> -static u32 inet_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +u32 inet_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 inet_ehash_secret __read_mostly; @@ -39,6 +39,7 @@ static u32 inet_ehashfn(const struct net *net, const __be32 laddr, return __inet_ehashfn(laddr, lport, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. @@ -332,20 +333,38 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) +/** + * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. + * @net: network namespace. + * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. + * @skb: context for a potential SK_REUSEPORT program. + * @doff: header offset. + * @saddr: source address. + * @sport: source port. + * @daddr: destination address. + * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. + * + * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to + * the selected sock or an error. + */ +struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum, + inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } +EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. The BSD API @@ -369,8 +388,8 @@ static struct sock *inet_lhash2_lookup(struct net *net, sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - result = lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + result = inet_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; @@ -382,24 +401,23 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } -static inline struct sock *inet_lookup_run_bpf(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, u16 hnum, const int dif) +struct sock *inet_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum, const int dif, + inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != net->ipv4.tcp_death_row.hashinfo) - return NULL; /* only TCP is supported */ - - no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, + no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, + ehashfn); if (reuse_sk) sk = reuse_sk; return sk; @@ -417,9 +435,11 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - result = inet_lookup_run_bpf(net, hashinfo, skb, doff, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + hashinfo == net->ipv4.tcp_death_row.hashinfo) { + result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, + saddr, sport, daddr, hnum, dif, + inet_ehashfn); if (result) goto done; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6ba1a0fafbaa..f28c87533a46 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -236,7 +236,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); - return -EINVAL; + return PTR_ERR(neigh); } static int ip_finish_output_gso(struct net *net, struct sock *sk, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index e61ea428ea18..a9ba7de092c4 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -7,6 +7,7 @@ #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/module.h> +#include <linux/rcupdate.h> #include <linux/skbuff.h> #include <net/netns/generic.h> #include <net/route.h> @@ -113,17 +114,31 @@ static void __net_exit defrag4_net_exit(struct net *net) } } +static const struct nf_defrag_hook defrag_hook = { + .owner = THIS_MODULE, + .enable = nf_defrag_ipv4_enable, + .disable = nf_defrag_ipv4_disable, +}; + static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, }; static int __init nf_defrag_init(void) { - return register_pernet_subsys(&defrag4_net_ops); + int err; + + err = register_pernet_subsys(&defrag4_net_ops); + if (err) + return err; + + rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook); + return err; } static void __exit nf_defrag_fini(void) { + rcu_assign_pointer(nf_defrag_v4_hook, NULL); unregister_pernet_subsys(&defrag4_net_ops); } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index be5498f5dd31..09d36bcbd7d4 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1152,41 +1152,64 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) +static bool nexthop_is_good_nh(const struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference(nh->nh_info); + + switch (nhi->family) { + case AF_INET: + return ipv4_good_nh(&nhi->fib_nh); + case AF_INET6: + return ipv6_good_nh(&nhi->fib6_nh); + } + + return false; +} + +static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { - struct nexthop *rc = NULL; int i; - for (i = 0; i < nhg->num_nh; ++i) { + for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - struct nh_info *nhi; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; - nhi = rcu_dereference(nhge->nh->nh_info); - if (nhi->fdb_nh) - return nhge->nh; + return nhge->nh; + } + + WARN_ON_ONCE(1); + return NULL; +} + +static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) +{ + struct nexthop *rc = NULL; + int i; + + if (nhg->fdb_nh) + return nexthop_select_path_fdb(nhg, hash); + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ - switch (nhi->family) { - case AF_INET: - if (ipv4_good_nh(&nhi->fib_nh)) - return nhge->nh; - break; - case AF_INET6: - if (ipv6_good_nh(&nhi->fib6_nh)) - return nhge->nh; - break; - } + if (!nexthop_is_good_nh(nhge->nh)) + continue; if (!rc) rc = nhge->nh; + + if (hash > atomic_read(&nhge->hthr.upper_bound)) + continue; + + return nhge->nh; } - return rc; + return rc ? : nhg->nh_entries[0].nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8ed52e1e3c99..4fbc7ff8c53c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); @@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { - int cap; + int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; @@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - val <<= 1; - if (val > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, val); - tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + space = tcp_space_from_win(sk, val); + if (space > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, space); + tcp_sk(sk)->window_clamp = val; } return 0; } @@ -2864,7 +2865,7 @@ adjudge_to_death: if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), @@ -3290,18 +3291,21 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; - lock_sock(sk); WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); -void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +int tcp_sock_set_user_timeout(struct sock *sk, int val) { - lock_sock(sk); + /* Cap the max time in ms TCP will retry or probe the window + * before giving up and aborting (ETIMEDOUT) a connection. + */ + if (val < 0) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); - release_sock(sk); + return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); @@ -3344,9 +3348,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; - lock_sock(sk); WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); @@ -3356,10 +3358,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; - lock_sock(sk); /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); @@ -3461,6 +3461,32 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; + /* Handle options that can be set without locking the socket. */ + switch (optname) { + case TCP_SYNCNT: + return tcp_sock_set_syncnt(sk, val); + case TCP_USER_TIMEOUT: + return tcp_sock_set_user_timeout(sk, val); + case TCP_KEEPINTVL: + return tcp_sock_set_keepintvl(sk, val); + case TCP_KEEPCNT: + return tcp_sock_set_keepcnt(sk, val); + case TCP_LINGER2: + if (val < 0) + WRITE_ONCE(tp->linger2, -1); + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); + else + WRITE_ONCE(tp->linger2, val * HZ); + return 0; + case TCP_DEFER_ACCEPT: + /* Translate value in seconds to number of retransmits */ + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ)); + return 0; + } + sockopt_lock_sock(sk); switch (optname) { @@ -3556,25 +3582,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - err = -EINVAL; - else - WRITE_ONCE(tp->keepalive_intvl, val * HZ); - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - err = -EINVAL; - else - WRITE_ONCE(tp->keepalive_probes, val); - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - err = -EINVAL; - else - WRITE_ONCE(icsk->icsk_syn_retries, val); - break; - case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) @@ -3583,22 +3590,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, tp->save_syn = val; break; - case TCP_LINGER2: - if (val < 0) - WRITE_ONCE(tp->linger2, -1); - else if (val > TCP_FIN_TIMEOUT_MAX / HZ) - WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); - else - WRITE_ONCE(tp->linger2, val * HZ); - break; - - case TCP_DEFER_ACCEPT: - /* Translate value in seconds to number of retransmits */ - WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, - TCP_RTO_MAX / HZ)); - break; - case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; @@ -3613,16 +3604,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif - case TCP_USER_TIMEOUT: - /* Cap the max time in ms TCP will retry or probe the window - * before giving up and aborting (ETIMEDOUT) a connection. - */ - if (val < 0) - err = -EINVAL; - else - WRITE_ONCE(icsk->icsk_user_timeout, val); - break; - case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c1..8e96ebe373d7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { + /* Note: divides are still a bit expensive. + * For the moment, only adjust scaling_ratio + * when we update icsk_ack.rcv_mss. + */ + if (unlikely(len != icsk->icsk_ack.rcv_mss)) { + u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, skb->truesize); + tcp_sk(sk)->scaling_ratio = val ? val : 1; + } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ @@ -287,7 +297,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -295,7 +305,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } -EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -727,8 +736,8 @@ void tcp_rcv_space_adjust(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. @@ -740,12 +749,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(grow, tp->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < tp->advmss) - rcvmem += 128; - - do_div(rcvwin, tp->advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); @@ -4122,9 +4126,8 @@ void tcp_parse_options(const struct net *net, break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: - /* - * The MD5 Hash has already been - * checked (see tcp_v{4,6}_do_rcv()). + /* The MD5 Hash has already been + * checked (see tcp_v{4,6}_rcv()). */ break; #endif @@ -4308,10 +4311,16 @@ static inline bool tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, + u32 seq, u32 end_seq) { - return !before(end_seq, tp->rcv_wup) && - !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); + if (before(end_seq, tp->rcv_wup)) + return SKB_DROP_REASON_TCP_OLD_SEQUENCE; + + if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + + return SKB_NOT_DROPPED_YET; } /* When we get a reset we do this. */ @@ -5734,7 +5743,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } /* Step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, @@ -5751,7 +5761,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } else if (tcp_reset_check(sk, skb)) { goto reset; } - SKB_DR_SET(reason, TCP_INVALID_SEQUENCE); goto discard; } @@ -6315,7 +6324,7 @@ consume: if (fastopen_fail) return -1; if (sk->sk_write_pending || - icsk->icsk_accept_queue.rskq_defer_accept || + READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) || inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. @@ -6615,7 +6624,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a59cc4b83861..5b18a048f613 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -57,6 +57,7 @@ #include <linux/init.h> #include <linux/times.h> #include <linux/slab.h> +#include <linux/sched.h> #include <net/net_namespace.h> #include <net/icmp.h> @@ -2448,6 +2449,8 @@ static void *established_get_first(struct seq_file *seq) struct hlist_nulls_node *node; spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); + cond_resched(); + /* Lockless fast path for the common case of empty buckets */ if (empty_bucket(hinfo, st)) continue; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 99ac5efe244d..c196759f1d3b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -990,7 +990,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .resv_start_op = TCP_METRICS_CMD_DEL + 1, }; -static unsigned int tcpmhash_entries; +static unsigned int tcpmhash_entries __initdata; static int __init set_tcpmhash_entries(char *str) { ssize_t ret; @@ -1006,15 +1006,11 @@ static int __init set_tcpmhash_entries(char *str) } __setup("tcpmhash_entries=", set_tcpmhash_entries); -static int __net_init tcp_net_metrics_init(struct net *net) +static void __init tcp_metrics_hash_alloc(void) { + unsigned int slots = tcpmhash_entries; size_t size; - unsigned int slots; - if (!net_eq(net, &init_net)) - return 0; - - slots = tcpmhash_entries; if (!slots) { if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; @@ -1027,9 +1023,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) - return -ENOMEM; - - return 0; + panic("Could not allocate the tcp_metrics hash table\n"); } static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) @@ -1038,7 +1032,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, }; @@ -1046,9 +1039,11 @@ void __init tcp_metrics_init(void) { int ret; + tcp_metrics_hash_alloc(); + ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) - panic("Could not allocate the tcp_metrics hash table\n"); + panic("Could not register tcp_net_metrics_ops\n"); ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c8f2aa003387..13ee12983c42 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -570,8 +570,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ - if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req))) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; @@ -794,7 +792,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ - if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 51d8638d4b4c..c5412ee77fc8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3741,11 +3741,6 @@ static void tcp_connect_init(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; -#ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk)) - tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; -#endif - /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 470f581eedd4..d45c96c7f5a4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -26,14 +26,15 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 elapsed, start_ts; + u32 elapsed, start_ts, user_timeout; s32 remaining; start_ts = tcp_sk(sk)->retrans_stamp; - if (!icsk->icsk_user_timeout) + user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (!user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; - remaining = icsk->icsk_user_timeout - elapsed; + remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ @@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 remaining; + u32 remaining, user_timeout; s32 elapsed; - if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp) + user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (!user_timeout || !icsk->icsk_probes_tstamp) return when; elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp; if (unlikely(elapsed < 0)) elapsed = 0; - remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed; + remaining = msecs_to_jiffies(user_timeout) - elapsed; remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN); return min_t(u32, remaining, when); @@ -239,7 +241,8 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? : + /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */ + retry_until = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); max_retransmits = retry_until; @@ -269,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk) } if (!expired) expired = retransmits_timed_out(sk, retry_until, - icsk->icsk_user_timeout); + READ_ONCE(icsk->icsk_user_timeout)); tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) @@ -383,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - if (!icsk->icsk_probes_tstamp) + if (!icsk->icsk_probes_tstamp) { icsk->icsk_probes_tstamp = tcp_jiffies32; - else if (icsk->icsk_user_timeout && - (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= - msecs_to_jiffies(icsk->icsk_user_timeout)) - goto abort; + } else { + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(user_timeout)) + goto abort; + } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -421,8 +427,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) req->rsk_ops->syn_ack_timeout(req); - /* add one more retry for fastopen */ - max_retries = icsk->icsk_syn_retries ? : + /* Add one more retry for fastopen. + * Paired with WRITE_ONCE() in tcp_sock_set_syncnt() + */ + max_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1; if (req->num_timeout >= max_retries) { @@ -706,7 +714,7 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { - if (tp->linger2 >= 0) { + if (READ_ONCE(tp->linger2) >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { @@ -731,13 +739,15 @@ static void tcp_keepalive_timer (struct timer_list *t) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); + /* If the TCP_USER_TIMEOUT option is enabled, use that * to determine when to timeout instead. */ - if ((icsk->icsk_user_timeout != 0 && - elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) && + if ((user_timeout != 0 && + elapsed >= msecs_to_jiffies(user_timeout) && icsk->icsk_probes_out > 0) || - (icsk->icsk_user_timeout == 0 && + (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index abfa860367aa..3e2f29c14fa8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -407,9 +407,9 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static u32 udp_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport) { static u32 udp_ehash_secret __read_mostly; @@ -419,22 +419,6 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -452,42 +436,36 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); + badness = score; + + if (sk->sk_state == TCP_ESTABLISHED) { + result = sk; + continue; + } + + result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp_ehashfn); + if (!result) { + result = sk; + continue; + } + /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) + if (!reuseport_has_conns(sk)) return result; - result = result ? : sk; - badness = score; + /* Reuseport logic returned an error, keep original score. */ + if (IS_ERR(result)) + continue; + + badness = compute_score(result, net, saddr, sport, + daddr, hnum, dif, sdif); + } } return result; } -static struct sock *udp4_lookup_run_bpf(struct net *net, - struct udp_table *udptable, - struct sk_buff *skb, - __be32 saddr, __be16 sport, - __be32 daddr, u16 hnum, const int dif) -{ - struct sock *sk, *reuse_sk; - bool no_reuseport; - - if (udptable != net->ipv4.udp_table) - return NULL; /* only UDP is supported */ - - no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport, - daddr, hnum, dif, &sk); - if (no_reuseport || IS_ERR_OR_NULL(sk)) - return sk; - - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); - if (reuse_sk) - sk = reuse_sk; - return sk; -} - /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -512,9 +490,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto done; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - sk = udp4_lookup_run_bpf(net, udptable, skb, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + udptable == net->ipv4.udp_table) { + sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, dif, + udp_ehashfn); if (sk) { result = sk; goto done; @@ -1557,7 +1537,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); busylock_release(busy); return 0; @@ -2412,7 +2392,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - sk = skb_steal_sock(skb, &refcounted); + sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, + &refcounted, udp_ehashfn); + if (IS_ERR(sk)) + goto no_sk; + if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -2433,7 +2417,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); - +no_sk: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 94cec2075eee..5184bd0ceb12 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -202,6 +202,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -262,6 +263,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -1061,20 +1063,28 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, struct fib6_info *f6i = NULL; int err = 0; - if (addr_type == IPV6_ADDR_ANY || - (addr_type & IPV6_ADDR_MULTICAST && - !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) || - (!(idev->dev->flags & IFF_LOOPBACK) && - !netif_is_l3_master(idev->dev) && - addr_type & IPV6_ADDR_LOOPBACK)) + if (addr_type == IPV6_ADDR_ANY) { + NL_SET_ERR_MSG_MOD(extack, "Invalid address"); return ERR_PTR(-EADDRNOTAVAIL); + } else if (addr_type & IPV6_ADDR_MULTICAST && + !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag"); + return ERR_PTR(-EADDRNOTAVAIL); + } else if (!(idev->dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(idev->dev) && + addr_type & IPV6_ADDR_LOOPBACK) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device"); + return ERR_PTR(-EADDRNOTAVAIL); + } if (idev->dead) { - err = -ENODEV; /*XXX*/ + NL_SET_ERR_MSG_MOD(extack, "device is going away"); + err = -ENODEV; goto out; } if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); err = -EACCES; goto out; } @@ -1101,7 +1111,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags); + f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); f6i = NULL; @@ -2731,6 +2741,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) return; } + if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft) + return; + /* * Two things going on here: * 1) Add routes for on-link prefixes @@ -2925,30 +2938,40 @@ static int inet6_addr_add(struct net *net, int ifindex, ASSERT_RTNL(); - if (cfg->plen > 128) + if (cfg->plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } /* check the lifetime */ - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) + if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) { + NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); return -EINVAL; + } - if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) + if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) { + NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64"); return -EINVAL; + } dev = __dev_get_by_index(net, ifindex); if (!dev) return -ENODEV; idev = addrconf_add_dev(dev); - if (IS_ERR(idev)) + if (IS_ERR(idev)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return PTR_ERR(idev); + } if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, true, cfg->pfx, ifindex); - if (ret < 0) + if (ret < 0) { + NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed"); return ret; + } } cfg->scope = ipv6_addr_scope(cfg->pfx); @@ -3005,22 +3028,29 @@ static int inet6_addr_add(struct net *net, int ifindex, } static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, - const struct in6_addr *pfx, unsigned int plen) + const struct in6_addr *pfx, unsigned int plen, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; - if (plen > 128) + if (plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } dev = __dev_get_by_index(net, ifindex); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); return -ENODEV; + } idev = __in6_dev_get(dev); - if (!idev) + if (!idev) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return -ENXIO; + } read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { @@ -3043,6 +3073,8 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, } } read_unlock_bh(&idev->lock); + + NL_SET_ERR_MSG_MOD(extack, "address not found"); return -EADDRNOTAVAIL; } @@ -3085,7 +3117,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) rtnl_lock(); err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, - ireq.ifr6_prefixlen); + ireq.ifr6_prefixlen, NULL); rtnl_unlock(); return err; } @@ -3488,7 +3520,7 @@ static int fixup_permanent_addr(struct net *net, struct fib6_info *f6i, *prev; f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, - GFP_ATOMIC); + GFP_ATOMIC, NULL); if (IS_ERR(f6i)) return PTR_ERR(f6i); @@ -4698,7 +4730,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa_flags &= IFA_F_MANAGETEMPADDR; return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, - ifm->ifa_prefixlen); + ifm->ifa_prefixlen, extack); } static int modify_prefix_route(struct inet6_ifaddr *ifp, @@ -4903,8 +4935,10 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, } dev = __dev_get_by_index(net, ifm->ifa_index); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); return -ENODEV; + } if (tb[IFA_FLAGS]) cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]); @@ -4939,10 +4973,12 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, } if (nlh->nlmsg_flags & NLM_F_EXCL || - !(nlh->nlmsg_flags & NLM_F_REPLACE)) + !(nlh->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "address already assigned"); err = -EEXIST; - else + } else { err = inet6_addr_modify(net, ifa, &cfg); + } in6_ifa_put(ifa); @@ -5602,6 +5638,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; + array[DEVCONF_ACCEPT_RA_MIN_LFT] = cnf->accept_ra_min_lft; } static inline size_t inet6_ifla6_size(void) @@ -6796,6 +6833,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "accept_ra_min_lft", + .data = &ipv6_devconf.accept_ra_min_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, .maxlen = sizeof(int), diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5d593ddc0347..9f9c4b838664 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -102,9 +102,9 @@ bool ipv6_mod_enabled(void) } EXPORT_SYMBOL_GPL(ipv6_mod_enabled); -static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { - const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + const int offset = sk->sk_prot->ipv6_pinfo_offset; return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index dacdea7fcb62..bb17f484ee2c 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -305,7 +305,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC); + f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); goto out; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9b6818453afe..d80d6024cafa 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -38,10 +38,11 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) +static void ip6_datagram_flow_key_init(struct flowi6 *fl6, + const struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 202fc3aaa83c..4952ae792450 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -612,8 +612,6 @@ looped_back: kfree(buf); - skb_dst_drop(skb); - ip6_route_input(skb); if (skb_dst(skb)->error) { @@ -650,7 +648,6 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; - struct in6_addr daddr; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; @@ -798,9 +795,7 @@ looped_back: return -1; } - daddr = *addr; - *addr = ipv6_hdr(skb)->daddr; - ipv6_hdr(skb)->daddr = daddr; + swap(*addr, ipv6_hdr(skb)->daddr); ip6_route_input(skb); if (skb_dst(skb)->error) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 65fa5014bc85..6d88f5248c1f 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1034,11 +1034,9 @@ drop_no_count: return 0; } -void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, - u8 type, +void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, - const struct in6_addr *daddr, - int oif) + const struct in6_addr *daddr, int oif) { memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 3faf62530d6a..69caed07315f 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <net/genetlink.h> -#include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index bee45dfeb187..67e8c9440977 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -5,7 +5,6 @@ #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> -#include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b64b49012655..b0e8d278e8a9 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -39,6 +39,7 @@ u32 inet6_ehashfn(const struct net *net, return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so @@ -111,22 +112,40 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned short hnum) +/** + * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. + * @net: network namespace. + * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. + * @skb: context for a potential SK_REUSEPORT program. + * @doff: header offset. + * @saddr: source address. + * @sport: source port. + * @daddr: destination address. + * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. + * + * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to + * the selected sock or an error. + */ +struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum, + inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } +EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(struct net *net, @@ -143,8 +162,8 @@ static struct sock *inet6_lhash2_lookup(struct net *net, sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - result = lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + result = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; @@ -156,30 +175,30 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } -static inline struct sock *inet6_lookup_run_bpf(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, const int dif) +struct sock *inet6_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, const int dif, + inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != net->ipv4.tcp_death_row.hashinfo) - return NULL; /* only TCP is supported */ - - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport, + no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } +EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, @@ -193,9 +212,11 @@ struct sock *inet6_lookup_listener(struct net *net, unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - result = inet6_lookup_run_bpf(net, hashinfo, skb, doff, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + hashinfo == net->ipv4.tcp_death_row.hashinfo) { + result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, + saddr, sport, daddr, hnum, dif, + inet6_ehashfn); if (result) goto done; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1e8c90e97608..bc96559bbf0f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1693,7 +1693,10 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; copy = datalen - transhdrlen - fraggap - pagedlen; - if (copy < 0) { + /* [!] NOTE: copy may be negative if pagedlen>0 + * because then the equation may reduces to -fraggap. + */ + if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { err = -EINVAL; goto error; } @@ -1744,6 +1747,8 @@ alloc_new_skb: err = -EFAULT; kfree_skb(skb); goto error; + } else if (flags & MSG_SPLICE_PAGES) { + copy = 0; } offset += copy; @@ -1791,6 +1796,10 @@ alloc_new_skb: } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; + err = -EIO; + if (WARN_ON_ONCE(copy > msg->msg_iter.count)) + goto error; + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index ae818ff46224..ca377159967c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -474,8 +474,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); - sk->sk_socket->ops = &inet_stream_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); + WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; @@ -488,8 +488,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); - sk->sk_socket->ops = &inet_dgram_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); + WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 714cdc9e2b8e..5ce25bcb9974 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1699,11 +1699,9 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } -static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int proto, int len) +static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, + struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a42be96ae209..553c8664e0a7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1267,10 +1267,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) } #endif - /* - * set the RA_RECV flag in the interface - */ - in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", @@ -1328,6 +1324,14 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_lft) { + ND_PRINTK(2, info, + "RA: router lifetime (%ds) is too short: %s\n", + lifetime, skb->dev->name); + goto skip_defrtr; + } + /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ @@ -1340,8 +1344,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } - lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); - #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ @@ -1517,6 +1519,9 @@ skip_linkparms: if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->lifetime != 0 && + ntohl(ri->lifetime) < in6_dev->cnf.accept_ra_min_lft) + continue; if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index cb4eb1d2c620..d59b296b4f51 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> +#include <linux/rcupdate.h> #include <linux/sysctl.h> #include <net/ipv6_frag.h> @@ -96,6 +97,12 @@ static void __net_exit defrag6_net_exit(struct net *net) } } +static const struct nf_defrag_hook defrag_hook = { + .owner = THIS_MODULE, + .enable = nf_defrag_ipv6_enable, + .disable = nf_defrag_ipv6_disable, +}; + static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; @@ -114,6 +121,9 @@ static int __init nf_defrag_init(void) pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } + + rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook); + return ret; cleanup_frag6: @@ -124,6 +134,7 @@ cleanup_frag6: static void __exit nf_defrag_fini(void) { + rcu_assign_pointer(nf_defrag_v6_hook, NULL); unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index c2c291827a2c..1b2772834972 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -215,6 +215,7 @@ struct proto pingv6_prot = { .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), }; EXPORT_SYMBOL_GPL(pingv6_prot); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 49381f35b623..ea16734f5e1f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1216,6 +1216,7 @@ struct proto rawv6_prot = { .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 56a55585eb79..10751df16dab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4544,7 +4544,8 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, - bool anycast, gfp_t gfp_flags) + bool anycast, gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, @@ -4566,7 +4567,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, cfg.fc_flags |= RTF_LOCAL; } - f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); + f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (!IS_ERR(f6i)) { f6i->dst_nocount = true; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index b1c028df686e..a013b92cbb86 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -272,8 +272,6 @@ static int rpl_input(struct sk_buff *skb) dst = dst_cache_get(&rlwt->cache); preempt_enable(); - skb_dst_drop(skb); - if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); @@ -284,6 +282,7 @@ static int rpl_input(struct sk_buff *skb) preempt_enable(); } } else { + skb_dst_drop(skb); skb_dst_set(skb, dst); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6e86721e1cdb..3a88545a265d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2176,6 +2176,7 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f787e6b8424c..1ea01b0d9be3 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -72,11 +72,12 @@ int udpv6_init_sock(struct sock *sk) return 0; } -static u32 udp6_ehashfn(const struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp6_ehashfn(const struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { static u32 udp6_ehash_secret __read_mostly; static u32 udp_ipv6_hash_secret __read_mostly; @@ -161,24 +162,6 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned int hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -195,44 +178,35 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); + badness = score; + + if (sk->sk_state == TCP_ESTABLISHED) { + result = sk; + continue; + } + + result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp6_ehashfn); + if (!result) { + result = sk; + continue; + } + /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) + if (!reuseport_has_conns(sk)) return result; - result = result ? : sk; - badness = score; + /* Reuseport logic returned an error, keep original score. */ + if (IS_ERR(result)) + continue; + + badness = compute_score(sk, net, saddr, sport, + daddr, hnum, dif, sdif); } } return result; } -static inline struct sock *udp6_lookup_run_bpf(struct net *net, - struct udp_table *udptable, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - u16 hnum, const int dif) -{ - struct sock *sk, *reuse_sk; - bool no_reuseport; - - if (udptable != net->ipv4.udp_table) - return NULL; /* only UDP is supported */ - - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport, - daddr, hnum, dif, &sk); - if (no_reuseport || IS_ERR_OR_NULL(sk)) - return sk; - - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); - if (reuse_sk) - sk = reuse_sk; - return sk; -} - /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -257,9 +231,11 @@ struct sock *__udp6_lib_lookup(struct net *net, goto done; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - sk = udp6_lookup_run_bpf(net, udptable, skb, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + udptable == net->ipv4.udp_table) { + sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, dif, + udp6_ehashfn); if (sk) { result = sk; goto done; @@ -992,7 +968,11 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; /* Check if the socket is already available, e.g. due to early demux */ - sk = skb_steal_sock(skb, &refcounted); + sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, + &refcounted, udp6_ehashfn); + if (IS_ERR(sk)) + goto no_sk; + if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -1026,7 +1006,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto report_csum_error; return udp6_unicast_rcv_skb(sk, skb, uh); } - +no_sk: reason = SKB_DROP_REASON_NO_SOCKET; if (!uh->check) @@ -1802,6 +1782,7 @@ struct proto udpv6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = NULL, .diag_destroy = udp_abort, }; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 8e010d07917a..267d491e9707 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -67,6 +67,7 @@ struct proto udplitev6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = &udplite_table, }; diff --git a/net/key/af_key.c b/net/key/af_key.c index ede3c6a60353..542439b6a59c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1281,7 +1281,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } - memset(&natt->encap_oa, 0, sizeof(natt->encap_oa)); } err = xfrm_init_state(x); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ff78217f0cb1..ed8ebb6f5909 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -36,9 +36,6 @@ struct l2tp_ip6_sock { u32 conn_id; u32 peer_conn_id; - /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see - * inet6_sk_generic - */ struct ipv6_pinfo inet6; }; @@ -730,6 +727,7 @@ static struct proto l2tp_ip6_prot = { .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), + .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index d037009ee10f..0a3f5e0bec00 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -14,14 +14,15 @@ #include <linux/init.h> #include <linux/slab.h> -#include <net/llc_sap.h> -#include <net/llc_conn.h> -#include <net/sock.h> -#include <net/tcp_states.h> -#include <net/llc_c_ev.h> +#include <net/llc.h> #include <net/llc_c_ac.h> +#include <net/llc_c_ev.h> #include <net/llc_c_st.h> +#include <net/llc_conn.h> #include <net/llc_pdu.h> +#include <net/llc_sap.h> +#include <net/sock.h> +#include <net/tcp_states.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d80658547836..48e649fe2360 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -67,11 +67,11 @@ static bool mptcp_is_tcpsk(struct sock *sk) * Hand the socket over to tcp so all further socket ops * bypass mptcp. */ - sock->ops = &inet_stream_ops; + WRITE_ONCE(sock->ops, &inet_stream_ops); return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { - sock->ops = &inet6_stream_ops; + WRITE_ONCE(sock->ops, &inet6_stream_ops); return true; #endif } @@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; + msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); @@ -1881,6 +1882,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; @@ -1911,9 +1913,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); + scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; + msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; @@ -1922,8 +1926,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; @@ -1932,18 +1936,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < advmss) - rcvmem += 128; - - do_div(rcvwin, advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = tcp_win_from_space(sk, rcvbuf); + window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we @@ -3684,7 +3683,7 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto unlock; } - err = ssock->ops->bind(ssock, uaddr, addr_len); + err = READ_ONCE(ssock->ops)->bind(ssock, uaddr, addr_len); if (!err) mptcp_copy_inaddrs(sock->sk, ssock->sk); @@ -3718,7 +3717,7 @@ static int mptcp_listen(struct socket *sock, int backlog) inet_sk_state_store(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); - err = ssock->ops->listen(ssock, backlog); + err = READ_ONCE(ssock->ops)->listen(ssock, backlog); inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -3987,6 +3986,7 @@ int __init mptcp_proto_v6_init(void) strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); + mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ba2a873a4d2e..79fc5cdb67bc 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -321,6 +321,7 @@ struct mptcp_sock { u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; + u8 scaling_ratio; u32 subflow_id; u32 setsockopt_seq; @@ -350,9 +351,14 @@ static inline int __mptcp_rmem(const struct sock *sk) return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); } +static inline int mptcp_win_from_space(const struct sock *sk, int space) +{ + return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); +} + static inline int __mptcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 94ae7dd01c65..9bf3c7bc1762 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); - *full_space = tcp_full_space(sk); + *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } void __mptcp_error_report(struct sock *sk) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5f76ae86a656..ef4e76e5aef9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -680,6 +680,12 @@ EXPORT_SYMBOL_GPL(nfnl_ct_hook); const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); +const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); + +const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) u8 nf_ctnetlink_has_listener; EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index c36da56d756f..e502ec00b2fe 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/kmod.h> +#include <linux/module.h> #include <linux/netfilter.h> #include <net/netfilter/nf_bpf_link.h> @@ -23,8 +25,90 @@ struct bpf_nf_link { struct nf_hook_ops hook_ops; struct net *net; u32 dead; + const struct nf_defrag_hook *defrag_hook; }; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +static const struct nf_defrag_hook * +get_proto_defrag_hook(struct bpf_nf_link *link, + const struct nf_defrag_hook __rcu *global_hook, + const char *mod) +{ + const struct nf_defrag_hook *hook; + int err; + + /* RCU protects us from races against module unloading */ + rcu_read_lock(); + hook = rcu_dereference(global_hook); + if (!hook) { + rcu_read_unlock(); + err = request_module(mod); + if (err) + return ERR_PTR(err < 0 ? err : -EINVAL); + + rcu_read_lock(); + hook = rcu_dereference(global_hook); + } + + if (hook && try_module_get(hook->owner)) { + /* Once we have a refcnt on the module, we no longer need RCU */ + hook = rcu_pointer_handoff(hook); + } else { + WARN_ONCE(!hook, "%s has bad registration", mod); + hook = ERR_PTR(-ENOENT); + } + rcu_read_unlock(); + + if (!IS_ERR(hook)) { + err = hook->enable(link->net); + if (err) { + module_put(hook->owner); + hook = ERR_PTR(err); + } + } + + return hook; +} +#endif + +static int bpf_nf_enable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook __maybe_unused *hook; + + switch (link->hook_ops.pf) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + case NFPROTO_IPV4: + hook = get_proto_defrag_hook(link, nf_defrag_v4_hook, "nf_defrag_ipv4"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + case NFPROTO_IPV6: + hook = get_proto_defrag_hook(link, nf_defrag_v6_hook, "nf_defrag_ipv6"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif + default: + return -EAFNOSUPPORT; + } +} + +static void bpf_nf_disable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook *hook = link->defrag_hook; + + if (!hook) + return; + hook->disable(link->net); + module_put(hook->owner); +} + static void bpf_nf_link_release(struct bpf_link *link) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); @@ -32,11 +116,11 @@ static void bpf_nf_link_release(struct bpf_link *link) if (nf_link->dead) return; - /* prevent hook-not-found warning splat from netfilter core when - * .detach was already called - */ - if (!cmpxchg(&nf_link->dead, 0, 1)) + /* do not double release in case .detach was already called */ + if (!cmpxchg(&nf_link->dead, 0, 1)) { nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); + bpf_nf_disable_defrag(nf_link); + } } static void bpf_nf_link_dealloc(struct bpf_link *link) @@ -92,6 +176,8 @@ static const struct bpf_link_ops bpf_nf_link_lops = { static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) { + int prio; + switch (attr->link_create.netfilter.pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: @@ -102,19 +188,18 @@ static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) return -EAFNOSUPPORT; } - if (attr->link_create.netfilter.flags) + if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG) return -EOPNOTSUPP; - /* make sure conntrack confirm is always last. - * - * In the future, if userspace can e.g. request defrag, then - * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG" - * should fail. - */ - switch (attr->link_create.netfilter.priority) { - case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */ - case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */ - } + /* make sure conntrack confirm is always last */ + prio = attr->link_create.netfilter.priority; + if (prio == NF_IP_PRI_FIRST) + return -ERANGE; /* sabotage_in and other warts */ + else if (prio == NF_IP_PRI_LAST) + return -ERANGE; /* e.g. conntrack confirm */ + else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) && + prio <= NF_IP_PRI_CONNTRACK_DEFRAG) + return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */ return 0; } @@ -149,6 +234,7 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->net = net; link->dead = false; + link->defrag_hook = NULL; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -156,8 +242,17 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) { + err = bpf_nf_enable_defrag(link); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + } + err = nf_register_net_hook(net, &link->hook_ops); if (err) { + bpf_nf_disable_defrag(link); bpf_link_cleanup(&link_primer); return err; } diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 0d36d7285e3f..c7a6114091ae 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/btf_ids.h> #include <linux/net_namespace.h> +#include <net/xdp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 992393102d5f..9f6f2e643575 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1756,7 +1756,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); - exp = nf_ct_find_expectation(net, zone, tuple); + exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 96948e98ec53..81ca348915c9 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; @@ -211,7 +211,7 @@ nf_ct_find_expectation(struct net *net, !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; - if (exp->flags & NF_CT_EXPECT_PERMANENT) { + if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 69c8c8c7e9b8..334db22199c1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1321,15 +1321,11 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; - ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL); + ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, + cta_ip_nla_policy, NULL); if (ret < 0) return ret; - ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX, - cta_ip_nla_policy, NULL); - if (ret) - return ret; - switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_nlattr_to_tuple(tb, tuple, flags); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index d4fd626d2b8c..e2db1f4ec2df 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -69,6 +69,7 @@ #define DCCP_MSL (2 * 60 * HZ) +#ifdef CONFIG_NF_CONNTRACK_PROCFS static const char * const dccp_state_names[] = { [CT_DCCP_NONE] = "NONE", [CT_DCCP_REQUEST] = "REQUEST", @@ -81,6 +82,7 @@ static const char * const dccp_state_names[] = { [CT_DCCP_IGNORE] = "IGNORE", [CT_DCCP_INVALID] = "INVALID", }; +#endif #define sNO CT_DCCP_NONE #define sRQ CT_DCCP_REQUEST diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 1c26f03fc661..a010b25076ca 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -34,7 +34,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; - unsigned int enc_keys; + unsigned long long enc_keys; if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) return; @@ -43,8 +43,8 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); mask->enc_key_id.keyid = 0xffffffff; - enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL); if (ip_tunnel_info_af(tun_info) == AF_INET) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, @@ -55,7 +55,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, mask->enc_ipv4.src = 0xffffffff; if (key->enc_ipv4.dst) mask->enc_ipv4.dst = 0xffffffff; - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else { memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, @@ -70,7 +70,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, sizeof(struct in6_addr))) memset(&mask->enc_ipv6.dst, 0xff, sizeof(struct in6_addr)); - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -163,14 +163,14 @@ static int nf_flow_rule_match(struct nf_flow_match *match, return -EOPNOTSUPP; } mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(key->control.addr_type); + match->dissector.used_keys |= BIT_ULL(key->control.addr_type); mask->basic.n_proto = 0xffff; switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: case IPPROTO_GRE: @@ -182,9 +182,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->basic.ip_proto = tuple->l4proto; mask->basic.ip_proto = 0xff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); switch (tuple->l4proto) { case IPPROTO_TCP: @@ -194,7 +194,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->tp.dst = tuple->dst_port; mask->tp.dst = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); break; } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 910ef881c3b8..12ab78fa5d84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -35,12 +35,12 @@ void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } @@ -59,7 +59,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, .mask = match->mask.basic.n_proto, }; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; @@ -70,8 +70,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); - } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) && + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); + } else if (match->dissector.used_keys & + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; @@ -80,7 +81,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index e57eb168ee13..53c9e76473ba 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -470,7 +470,6 @@ __build_packet_message(struct nfnl_log_net *log, sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - ktime_t tstamp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), @@ -599,10 +598,9 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - tstamp = skb_tstamp_cond(skb, false); - if (hooknum <= NF_INET_FORWARD && tstamp) { + if (hooknum <= NF_INET_FORWARD) { + struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true)); struct nfulnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 6eb21a4f5698..cd4652259095 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -162,7 +162,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, memcpy(key + reg->offset, data, reg->len); memcpy(mask + reg->offset, datamask, reg->len); - flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.used_keys |= BIT_ULL(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; if (reg->key == FLOW_DISSECTOR_KEY_META && diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 38958e067aa8..e87fd4314c68 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -262,6 +262,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; return; } + __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); @@ -368,6 +369,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } + __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 6e049fd48760..601c9e09d07a 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -14,17 +14,18 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> +#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ + NFTA_FIB_F_PRESENT) + const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { [NFTA_FIB_DREG] = { .type = NLA_U32 }, [NFTA_FIB_RESULT] = { .type = NLA_U32 }, - [NFTA_FIB_FLAGS] = { .type = NLA_U32 }, + [NFTA_FIB_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL), }; EXPORT_SYMBOL(nft_fib_policy); -#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ - NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ - NFTA_FIB_F_PRESENT) - int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { @@ -77,7 +78,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS])); - if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL)) + if (priv->flags == 0) return -EINVAL; if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 29ac48cdd6db..870e5b113d13 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -90,7 +90,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, - [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 }, + [NFTA_LOOKUP_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, @@ -120,9 +121,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); - if (flags & ~NFT_LOOKUP_F_INV) - return -EINVAL; - if (flags & NFT_LOOKUP_F_INV) priv->invert = true; } diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index b115d77fbbc7..8a14aaca93bb 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -20,7 +20,8 @@ struct nft_masq { }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; @@ -47,11 +48,8 @@ static int nft_masq_init(const struct nft_ctx *ctx, struct nft_masq *priv = nft_expr_priv(expr); int err; - if (tb[NFTA_MASQ_FLAGS]) { + if (tb[NFTA_MASQ_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 5c29915ab028..583885ce7232 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -132,7 +132,8 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, @@ -246,11 +247,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_NAT_FLAGS]) { + if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EOPNOTSUPP; - } return nf_ct_netns_get(ctx->net, family); } diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index a70196ffcb1e..a58bd8d291ff 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -22,7 +22,8 @@ struct nft_redir { static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_REDIR_FLAGS] = { .type = NLA_U32 }, + [NFTA_REDIR_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_redir_validate(const struct nft_ctx *ctx, @@ -68,11 +69,8 @@ static int nft_redir_init(const struct nft_ctx *ctx, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_REDIR_FLAGS]) { + if (tb[NFTA_REDIR_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } return nf_ct_netns_get(ctx->net, ctx->family); } diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h index 85d7ecb05728..9518ab56ec98 100644 --- a/net/netlabel/netlabel_cipso_v4.h +++ b/net/netlabel/netlabel_cipso_v4.h @@ -149,7 +149,4 @@ enum { /* NetLabel protocol functions */ int netlbl_cipsov4_genl_init(void); -/* Free the memory associated with a CIPSOv4 DOI definition */ -void netlbl_cipsov4_doi_free(struct rcu_head *entry); - #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 383631873748..96c605e45235 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -677,6 +677,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; @@ -704,6 +705,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; + release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) @@ -719,6 +721,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; + nlk->netlink_release = release; out: return err; @@ -763,6 +766,8 @@ static int netlink_release(struct socket *sock) * OK. Socket is unlinked, any packets that arrive now * will be purged. */ + if (nlk->netlink_release) + nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock @@ -1432,6 +1437,8 @@ struct netlink_broadcast_data { int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; + int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); + void *tx_data; }; static void do_one_broadcast(struct sock *sk, @@ -1485,6 +1492,13 @@ static void do_one_broadcast(struct sock *sk, p->delivery_failure = 1; goto out; } + + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + kfree_skb(p->skb2); + p->skb2 = NULL; + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; @@ -1507,8 +1521,12 @@ out: sock_put(sk); } -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation) +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, + u32 portid, + u32 group, gfp_t allocation, + int (*filter)(struct sock *dsk, + struct sk_buff *skb, void *data), + void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; @@ -1527,6 +1545,8 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, info.allocation = allocation; info.skb = skb; info.skb2 = NULL; + info.tx_filter = filter; + info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ @@ -1552,6 +1572,14 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, } return -ESRCH; } +EXPORT_SYMBOL(netlink_broadcast_filtered); + +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, + u32 group, gfp_t allocation) +{ + return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, + NULL, NULL); +} EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { @@ -1629,10 +1657,7 @@ static void netlink_update_socket_mc(struct netlink_sock *nlk, old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; - if (new) - __set_bit(group - 1, nlk->groups); - else - __clear_bit(group - 1, nlk->groups); + __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } @@ -2069,6 +2094,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; + nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 90a3198a9b7f..fd424cd63f31 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -42,6 +42,8 @@ struct netlink_sock { void (*netlink_rcv)(struct sk_buff *skb); int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); + void (*netlink_release)(struct sock *sk, + unsigned long *groups); struct module *module; struct rhash_head node; @@ -64,6 +66,8 @@ struct netlink_table { struct module *module; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sk, + unsigned long *groups); int registered; }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index a157247a1e45..6bd2ce51271f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -593,8 +593,12 @@ static int genl_validate_ops(const struct genl_family *family) return -EINVAL; /* Check sort order */ - if (a->cmd < b->cmd) + if (a->cmd < b->cmd) { continue; + } else if (a->cmd > b->cmd) { + WARN_ON(1); + return -EINVAL; + } if (a->internal_flags != b->internal_flags || ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 331730fd3580..fa955e892210 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, return 0; } -static struct nf_conntrack_expect * -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, - u16 proto, const struct sk_buff *skb) -{ - struct nf_conntrack_tuple tuple; - struct nf_conntrack_expect *exp; - - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) - return NULL; - - exp = __nf_ct_expect_find(net, zone, &tuple); - if (exp) { - struct nf_conntrack_tuple_hash *h; - - /* Delete existing conntrack entry, if it clashes with the - * expectation. This can happen since conntrack ALGs do not - * check for clashes between (new) expectations and existing - * conntrack entries. nf_conntrack_in() will check the - * expectations only if a conntrack entry can not be found, - * which can lead to OVS finding the expectation (here) in the - * init direction, but which will not be removed by the - * nf_conntrack_in() call, if a matching conntrack entry is - * found instead. In this case all init direction packets - * would be reported as new related packets, while reply - * direction packets would be reported as un-related - * established packets. - */ - h = nf_conntrack_find_get(net, zone, &tuple); - if (h) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - nf_ct_delete(ct, 0, 0); - nf_ct_put(ct); - } - } - - return exp; -} - /* This replicates logic from nf_conntrack_core.c that is not exported. */ static enum ip_conntrack_info ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - struct nf_conntrack_expect *exp; - - /* If we pass an expected packet through nf_conntrack_in() the - * expectation is typically removed, but the packet could still be - * lost in upcall processing. To prevent this from happening we - * perform an explicit expectation lookup. Expected connections are - * always new, and will be passed through conntrack only when they are - * committed, as it is OK to remove the expectation at that time. - */ - exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); - if (exp) { - u8 state; - - /* NOTE: New connections are NATted and Helped only when - * committed, so we are not calling into NAT here. - */ - state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - struct nf_conn *ct; - int err; + struct nf_conn *ct; + int err; - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; - ct = (struct nf_conn *)skb_nfct(skb); - if (ct) - nf_ct_deliver_cached_events(ct); - } + ct = (struct nf_conn *)skb_nfct(skb); + if (ct) + nf_ct_deliver_cached_events(ct); return 0; } @@ -1460,7 +1401,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + if (ct_info.commit) + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a2935bd18ed9..8f97648d652f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2931,8 +2931,10 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, if (prepad + len < PAGE_SIZE || !linear) linear = len; + if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - err, 0); + err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 78beb74146e7..41ece61eb57a 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -23,6 +23,8 @@ #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) +#define QRTR_PORT_CTRL_LEGACY 0xffff + /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version @@ -495,6 +497,9 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) goto err; } + if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) + cb->dst_port = QRTR_PORT_CTRL; + if (!size || len != ALIGN(size, 4) + hdrlen) goto err; diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 0f7a729f1a1f..b1db0b519179 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -16,7 +16,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/qrtr.h> -static RADIX_TREE(nodes, GFP_KERNEL); +static DEFINE_XARRAY(nodes); static struct { struct socket *sock; @@ -66,14 +66,14 @@ struct qrtr_server { struct qrtr_node { unsigned int id; - struct radix_tree_root servers; + struct xarray servers; }; static struct qrtr_node *node_get(unsigned int node_id) { struct qrtr_node *node; - node = radix_tree_lookup(&nodes, node_id); + node = xa_load(&nodes, node_id); if (node) return node; @@ -83,8 +83,9 @@ static struct qrtr_node *node_get(unsigned int node_id) return NULL; node->id = node_id; + xa_init(&node->servers); - if (radix_tree_insert(&nodes, node_id, node)) { + if (xa_store(&nodes, node_id, node, GFP_KERNEL)) { kfree(node); return NULL; } @@ -193,40 +194,23 @@ static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, static int announce_servers(struct sockaddr_qrtr *sq) { - struct radix_tree_iter iter; struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; int ret; node = node_get(qrtr_ns.local_node); if (!node) return 0; - rcu_read_lock(); /* Announce the list of servers registered in this node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&node->servers, index, srv) { ret = service_announce_new(sq, srv); if (ret < 0) { pr_err("failed to announce new service\n"); return ret; } - - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -256,14 +240,17 @@ static struct qrtr_server *server_add(unsigned int service, goto err; /* Delete the old server on the same port */ - old = radix_tree_lookup(&node->servers, port); + old = xa_store(&node->servers, port, srv, GFP_KERNEL); if (old) { - radix_tree_delete(&node->servers, port); - kfree(old); + if (xa_is_err(old)) { + pr_err("failed to add server [0x%x:0x%x] ret:%d\n", + srv->service, srv->instance, xa_err(old)); + goto err; + } else { + kfree(old); + } } - radix_tree_insert(&node->servers, port, srv); - trace_qrtr_ns_server_add(srv->service, srv->instance, srv->node, srv->port); @@ -280,11 +267,11 @@ static int server_del(struct qrtr_node *node, unsigned int port, bool bcast) struct qrtr_server *srv; struct list_head *li; - srv = radix_tree_lookup(&node->servers, port); + srv = xa_load(&node->servers, port); if (!srv) return -ENOENT; - radix_tree_delete(&node->servers, port); + xa_erase(&node->servers, port); /* Broadcast the removal of local servers */ if (srv->node == qrtr_ns.local_node && bcast) @@ -344,13 +331,12 @@ static int ctrl_cmd_hello(struct sockaddr_qrtr *sq) static int ctrl_cmd_bye(struct sockaddr_qrtr *from) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_ctrl_pkt pkt; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct msghdr msg = { }; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -361,22 +347,9 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) if (!node) return 0; - rcu_read_lock(); /* Advertise removal of this client to all servers of remote node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); + xa_for_each(&node->servers, index, srv) server_del(node, srv->port, true); - rcu_read_lock(); - } - rcu_read_unlock(); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); @@ -387,18 +360,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE); pkt.client.node = cpu_to_le32(from->sq_node); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -411,11 +373,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pr_err("failed to send bye cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -423,7 +381,6 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, unsigned int node_id, unsigned int port) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_lookup *lookup; struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; @@ -432,7 +389,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, struct qrtr_node *node; struct list_head *tmp; struct list_head *li; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -477,18 +434,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, pkt.client.node = cpu_to_le32(node_id); pkt.client.port = cpu_to_le32(port); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -501,11 +447,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, pr_err("failed to send del client cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -576,13 +518,12 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { - struct radix_tree_iter node_iter; struct qrtr_server_filter filter; - struct radix_tree_iter srv_iter; struct qrtr_lookup *lookup; + struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **node_slot; - void __rcu **srv_slot; + unsigned long node_idx; + unsigned long srv_idx; /* Accept only local observers */ if (from->sq_node != qrtr_ns.local_node) @@ -601,40 +542,14 @@ static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, filter.service = service; filter.instance = instance; - rcu_read_lock(); - radix_tree_for_each_slot(node_slot, &nodes, &node_iter, 0) { - node = radix_tree_deref_slot(node_slot); - if (!node) - continue; - if (radix_tree_deref_retry(node)) { - node_slot = radix_tree_iter_retry(&node_iter); - continue; - } - node_slot = radix_tree_iter_resume(node_slot, &node_iter); - - radix_tree_for_each_slot(srv_slot, &node->servers, - &srv_iter, 0) { - struct qrtr_server *srv; - - srv = radix_tree_deref_slot(srv_slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - srv_slot = radix_tree_iter_retry(&srv_iter); - continue; - } - + xa_for_each(&nodes, node_idx, node) { + xa_for_each(&node->servers, srv_idx, srv) { if (!server_match(srv, &filter)) continue; - srv_slot = radix_tree_iter_resume(srv_slot, &srv_iter); - - rcu_read_unlock(); lookup_notify(from, srv, true); - rcu_read_lock(); } } - rcu_read_unlock(); /* Empty notification, to indicate end of listing */ lookup_notify(from, NULL, true); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4b95cb1ac435..470c70deffe2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -347,8 +347,7 @@ config NET_SCH_FQ_PIE config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT - select NET_INGRESS - select NET_EGRESS + select NET_XGRESS help Say Y here if you want to use classifiers for incoming and/or outgoing packets. This qdisc doesn't do anything else besides running classifiers, @@ -679,6 +678,7 @@ config NET_EMATCH_IPT config NET_CLS_ACT bool "Actions" select NET_CLS + select NET_XGRESS help Say Y here if you want to use traffic control actions. Actions get attached to classifiers and are invoked after a successful diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index abc71a06d634..7c652d14528b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1238,7 +1238,8 @@ static int tcf_ct_fill_params(struct net *net, } } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + if (p->ct_action & TCA_CT_ACT_COMMIT) + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; err: nf_ct_put(p->tmpl); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9f0711da9c95..e5314a31f75a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -72,6 +72,7 @@ struct fl_flow_key { struct flow_dissector_key_num_of_vlans num_of_vlans; struct flow_dissector_key_pppoe pppoe; struct flow_dissector_key_l2tpv3 l2tpv3; + struct flow_dissector_key_ipsec ipsec; struct flow_dissector_key_cfm cfm; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ @@ -726,6 +727,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_SPI] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_SPI_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED }, }; @@ -796,6 +799,24 @@ static void fl_set_key_val(struct nlattr **tb, nla_memcpy(mask, tb[mask_type], len); } +static int fl_set_key_spi(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + if (key->basic.ip_proto != IPPROTO_ESP && + key->basic.ip_proto != IPPROTO_AH) { + NL_SET_ERR_MSG(extack, + "Protocol must be either ESP or AH"); + return -EINVAL; + } + + fl_set_key_val(tb, &key->ipsec.spi, + TCA_FLOWER_KEY_SPI, + &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK, + sizeof(key->ipsec.spi)); + return 0; +} + static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1895,6 +1916,12 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; } + if (tb[TCA_FLOWER_KEY_SPI]) { + ret = fl_set_key_spi(tb, key, mask, extack); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -2068,6 +2095,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3); FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPSEC, ipsec); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CFM, cfm); skb_flow_dissector_init(dissector, keys, cnt); @@ -3365,6 +3394,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->l2tpv3.session_id))) goto nla_put_failure; + if (key->ipsec.spi && + fl_dump_key_val(skb, &key->ipsec.spi, TCA_FLOWER_KEY_SPI, + &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK, + sizeof(key->ipsec.spi))) + goto nla_put_failure; + if ((key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || key->basic.ip_proto == IPPROTO_SCTP) && diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index e35a4e90f4e6..19901e77cd3b 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -17,7 +17,6 @@ struct drr_class { struct Qdisc_class_common common; - unsigned int filter_cnt; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; @@ -150,8 +149,10 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; - if (cl->filter_cnt > 0) + if (qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG(extack, "DRR class is in use"); return -EBUSY; + } sch_tree_lock(sch); @@ -187,8 +188,8 @@ static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, { struct drr_class *cl = drr_find_class(sch, classid); - if (cl != NULL) - cl->filter_cnt++; + if (cl) + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -197,7 +198,7 @@ static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static int drr_graft_class(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 70b0c5873d32..98805303218d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,7 +116,6 @@ struct hfsc_class { struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ struct tcf_block *block; - unsigned int filter_cnt; /* filter count */ unsigned int level; /* class level in hierarchy */ struct hfsc_sched *sched; /* scheduler data */ @@ -1094,8 +1093,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg, struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; - if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) + if (cl->level > 0 || qdisc_class_in_use(&cl->cl_common) || + cl == &q->root) { + NL_SET_ERR_MSG(extack, "HFSC class in use"); return -EBUSY; + } sch_tree_lock(sch); @@ -1223,7 +1225,7 @@ hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) if (cl != NULL) { if (p != NULL && p->level <= cl->level) return 0; - cl->filter_cnt++; + qdisc_class_get(&cl->cl_common); } return (unsigned long)cl; @@ -1234,7 +1236,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->cl_common); } static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 325c29041c7d..0d947414e616 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -102,7 +102,6 @@ struct htb_class { struct tcf_proto __rcu *filter_list; /* class attached filters */ struct tcf_block *block; - int filter_cnt; int level; /* our level (see above) */ unsigned int children; @@ -1710,8 +1709,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, * tc subsys guarantee us that in htb_destroy it holds no class * refs so that we can remove children safely there ? */ - if (cl->children || cl->filter_cnt) + if (cl->children || qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG(extack, "HTB class in use"); return -EBUSY; + } if (!cl->level && htb_parent_last_child(cl)) last_child = 1; @@ -1810,10 +1811,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter"); goto failure; } - if (hopt->quantum) { - NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter"); - goto failure; - } } /* Keeping backward compatible with rate_table based iproute2 tc */ @@ -1910,6 +1907,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -1931,6 +1929,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -2017,6 +2016,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -2108,7 +2108,7 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, * be broken by class during destroy IIUC. */ if (cl) - cl->filter_cnt++; + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -2116,8 +2116,7 @@ static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - if (cl) - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index e43a45499372..a463a63192c3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -13,6 +13,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tcx.h> struct ingress_sched_data { struct tcf_block *block; @@ -78,6 +79,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_INGRESS) @@ -85,7 +88,13 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); - mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; @@ -103,11 +112,22 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress); if (sch->parent != TC_H_INGRESS) return; tcf_block_put_ext(q->block, sch, &q->block_info); + + if (entry) { + tcx_miniq_set_active(entry, false); + if (!tcx_entry_is_active(entry)) { + tcx_entry_update(dev, NULL, true); + tcx_entry_free(entry); + } + } + net_dec_ingress_queue(); } @@ -223,6 +243,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, { struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_CLSACT) @@ -231,7 +253,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); net_inc_egress_queue(); - mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; @@ -244,7 +272,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block); - mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + entry = tcx_entry_fetch_or_create(dev, false, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, false); q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; @@ -256,12 +290,31 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress); + struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress); if (sch->parent != TC_H_CLSACT) return; - tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); + + if (ingress_entry) { + tcx_miniq_set_active(ingress_entry, false); + if (!tcx_entry_is_active(ingress_entry)) { + tcx_entry_update(dev, NULL, true); + tcx_entry_free(ingress_entry); + } + } + + if (egress_entry) { + tcx_miniq_set_active(egress_entry, false); + if (!tcx_entry_is_active(egress_entry)) { + tcx_entry_update(dev, NULL, false); + tcx_entry_free(egress_entry); + } + } net_dec_ingress_queue(); net_dec_egress_queue(); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index befaf74b33ca..1a25752f1a9a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -130,8 +130,6 @@ struct qfq_aggregate; struct qfq_class { struct Qdisc_class_common common; - unsigned int filter_cnt; - struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; @@ -545,8 +543,10 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; - if (cl->filter_cnt > 0) + if (qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG_MOD(extack, "QFQ class in use"); return -EBUSY; + } sch_tree_lock(sch); @@ -580,8 +580,8 @@ static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, { struct qfq_class *cl = qfq_find_class(sch, classid); - if (cl != NULL) - cl->filter_cnt++; + if (cl) + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -590,7 +590,7 @@ static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct qfq_class *cl = (struct qfq_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 8c9cfff7fd05..1cb5e41c0ec7 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2099,11 +2099,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, return -EOPNOTSUPP; } - /* pre-allocate qdisc, attachment can't fail */ - q->qdiscs = kcalloc(dev->num_tx_queues, - sizeof(q->qdiscs[0]), + q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); - if (!q->qdiscs) return -ENOMEM; @@ -2145,25 +2142,32 @@ static void taprio_attach(struct Qdisc *sch) /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { - struct Qdisc *qdisc = q->qdiscs[ntx]; - struct Qdisc *old; + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); + struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + struct Qdisc *qdisc = q->qdiscs[ntx]; + + /* In offload mode, the root taprio qdisc is bypassed + * and the netdev TX queues see the children directly + */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; - old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + dev_queue_qdisc = qdisc; } else { - old = dev_graft_qdisc(qdisc->dev_queue, sch); - qdisc_refcount_inc(sch); + /* In software mode, attach the root taprio qdisc + * to all netdev TX queues, so that dev_qdisc_enqueue() + * goes through taprio_enqueue(). + */ + dev_queue_qdisc = sch; } + old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); + /* The qdisc's refcount requires to be elevated once + * for each netdev TX queue it is grafted onto + */ + qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } - - /* access to the child qdiscs is not needed in offload mode */ - if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { - kfree(q->qdiscs); - q->qdiscs = NULL; - } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, @@ -2192,13 +2196,23 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, if (dev->flags & IFF_UP) dev_deactivate(dev); + /* In offload mode, the child Qdisc is directly attached to the netdev + * TX queue, and thus, we need to keep its refcount elevated in order + * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. + * However, save the reference to the new qdisc in the private array in + * both software and offload cases, to have an up-to-date reference to + * our children. + */ + *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { - *old = dev_graft_qdisc(dev_queue, new); - } else { - *old = q->qdiscs[cl - 1]; - q->qdiscs[cl - 1] = new; + WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); + if (new) + qdisc_refcount_inc(new); + if (*old) + qdisc_put(*old); } + q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; @@ -2436,12 +2450,14 @@ start_error: static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = cl - 1; - if (!dev_queue) + if (ntx >= dev->num_tx_queues) return NULL; - return rtnl_dereference(dev_queue->qdisc_sleeping); + return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) @@ -2456,11 +2472,11 @@ static unsigned long taprio_find(struct Qdisc *sch, u32 classid) static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); - tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; + tcm->tcm_info = child->handle; return 0; } @@ -2470,16 +2486,14 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __releases(d->lock) __acquires(d->lock) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; - struct Qdisc *child; - child = rtnl_dereference(dev_queue->qdisc_sleeping); if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 274d07bd774f..33c0895e101c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -435,7 +435,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9388d98aebc0..6e3d28aa587c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9732,6 +9732,7 @@ struct proto sctpv6_prot = { .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), + .ipv6_pinfo_offset = offsetof(struct sctp6_sock, inet6), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - offsetof(struct sctp6_sock, sctp.subscribe) + diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1645fba0d2d3..3c1b31bfa1cf 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -539,7 +539,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); -void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 034295676e88..4df5f8c8a0a1 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -96,7 +96,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk); int smc_ib_create_queue_pair(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); -int smc_ib_modify_qp_reset(struct smc_link *lnk); int smc_ib_modify_qp_error(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, diff --git a/net/socket.c b/net/socket.c index 2b0e54b2405c..5d4e37595e9a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -136,9 +136,10 @@ static void sock_splice_eof(struct file *file); static void sock_show_fdinfo(struct seq_file *m, struct file *f) { struct socket *sock = f->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); - if (sock->ops->show_fdinfo) - sock->ops->show_fdinfo(m, sock); + if (ops->show_fdinfo) + ops->show_fdinfo(m, sock); } #else #define sock_show_fdinfo NULL @@ -646,12 +647,14 @@ EXPORT_SYMBOL(sock_alloc); static void __sock_release(struct socket *sock, struct inode *inode) { - if (sock->ops) { - struct module *owner = sock->ops->owner; + const struct proto_ops *ops = READ_ONCE(sock->ops); + + if (ops) { + struct module *owner = ops->owner; if (inode) inode_lock(inode); - sock->ops->release(sock); + ops->release(sock); sock->sk = NULL; if (inode) inode_unlock(inode); @@ -722,7 +725,7 @@ static noinline void call_trace_sock_send_length(struct sock *sk, int ret, static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, + int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); @@ -786,13 +789,14 @@ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops = READ_ONCE(sock->ops); - if (!sock->ops->sendmsg_locked) + if (!ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); - return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); + return ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } EXPORT_SYMBOL(kernel_sendmsg_locked); @@ -1017,7 +1021,8 @@ static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int f static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg, + inet6_recvmsg, inet_recvmsg, sock, msg, msg_data_left(msg), flags); if (trace_sock_recv_length_enabled()) @@ -1072,19 +1077,23 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct socket *sock = file->private_data; + const struct proto_ops *ops; - if (unlikely(!sock->ops->splice_read)) + ops = READ_ONCE(sock->ops); + if (unlikely(!ops->splice_read)) return copy_splice_read(file, ppos, pipe, len, flags); - return sock->ops->splice_read(sock, ppos, pipe, len, flags); + return ops->splice_read(sock, ppos, pipe, len, flags); } static void sock_splice_eof(struct file *file) { struct socket *sock = file->private_data; + const struct proto_ops *ops; - if (sock->ops->splice_eof) - sock->ops->splice_eof(sock); + ops = READ_ONCE(sock->ops); + if (ops->splice_eof) + ops->splice_eof(sock); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1181,13 +1190,14 @@ EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct ifreq ifr; bool need_copyout; int err; void __user *argp = (void __user *)arg; void __user *data; - err = sock->ops->ioctl(sock, cmd, arg); + err = ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down @@ -1216,6 +1226,7 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { + const struct proto_ops *ops; struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; @@ -1223,6 +1234,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct net *net; sock = file->private_data; + ops = READ_ONCE(sock->ops); sk = sock->sk; net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { @@ -1280,23 +1292,23 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: - if (!sock->ops->gettstamp) { + if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } - err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP_OLD, - !IS_ENABLED(CONFIG_64BIT)); + err = ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_OLD, + !IS_ENABLED(CONFIG_64BIT)); break; case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: - if (!sock->ops->gettstamp) { + if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } - err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP_NEW, - false); + err = ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_NEW, + false); break; case SIOCGIFCONF: @@ -1357,9 +1369,10 @@ EXPORT_SYMBOL(sock_create_lite); static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); __poll_t events = poll_requested_events(wait), flag = 0; - if (!sock->ops->poll) + if (!ops->poll) return 0; if (sk_can_busy_loop(sock->sk)) { @@ -1371,14 +1384,14 @@ static __poll_t sock_poll(struct file *file, poll_table *wait) flag = POLL_BUSY_LOOP; } - return sock->ops->poll(file, sock, wait) | flag; + return ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; - return sock->ops->mmap(file, sock, vma); + return READ_ONCE(sock->ops)->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) @@ -1728,7 +1741,7 @@ int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) goto out; } - err = sock1->ops->socketpair(sock1, sock2); + err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2); if (unlikely(err < 0)) { sock_release(sock2); sock_release(sock1); @@ -1789,7 +1802,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) (struct sockaddr *)&address, addrlen); if (!err) - err = sock->ops->bind(sock, + err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *) &address, addrlen); } @@ -1823,7 +1836,7 @@ int __sys_listen(int fd, int backlog) err = security_socket_listen(sock, backlog); if (!err) - err = sock->ops->listen(sock, backlog); + err = READ_ONCE(sock->ops)->listen(sock, backlog); fput_light(sock->file, fput_needed); } @@ -1843,6 +1856,7 @@ struct file *do_accept(struct file *file, unsigned file_flags, struct file *newfile; int err, len; struct sockaddr_storage address; + const struct proto_ops *ops; sock = sock_from_file(file); if (!sock) @@ -1851,15 +1865,16 @@ struct file *do_accept(struct file *file, unsigned file_flags, newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); + ops = READ_ONCE(sock->ops); newsock->type = sock->type; - newsock->ops = sock->ops; + newsock->ops = ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. */ - __module_get(newsock->ops->owner); + __module_get(ops->owner); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) @@ -1869,14 +1884,13 @@ struct file *do_accept(struct file *file, unsigned file_flags, if (err) goto out_fd; - err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags, + err = ops->accept(sock, newsock, sock->file->f_flags | file_flags, false); if (err < 0) goto out_fd; if (upeer_sockaddr) { - len = newsock->ops->getname(newsock, - (struct sockaddr *)&address, 2); + len = ops->getname(newsock, (struct sockaddr *)&address, 2); if (len < 0) { err = -ECONNABORTED; goto out_fd; @@ -1989,8 +2003,8 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, if (err) goto out; - err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, - sock->file->f_flags | file_flags); + err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, + addrlen, sock->file->f_flags | file_flags); out: return err; } @@ -2039,7 +2053,7 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, if (err) goto out_put; - err = sock->ops->getname(sock, (struct sockaddr *)&address, 0); + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); if (err < 0) goto out_put; /* "err" is actually length in this case */ @@ -2071,13 +2085,15 @@ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { + const struct proto_ops *ops = READ_ONCE(sock->ops); + err = security_socket_getpeername(sock); if (err) { fput_light(sock->file, fput_needed); return err; } - err = sock->ops->getname(sock, (struct sockaddr *)&address, 1); + err = ops->getname(sock, (struct sockaddr *)&address, 1); if (err >= 0) /* "err" is actually length in this case */ err = move_addr_to_user(&address, err, usockaddr, @@ -2227,6 +2243,7 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { sockptr_t optval = USER_SOCKPTR(user_optval); + const struct proto_ops *ops; char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2255,12 +2272,13 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, if (kernel_optval) optval = KERNEL_SOCKPTR(kernel_optval); + ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); - else if (unlikely(!sock->ops->setsockopt)) + else if (unlikely(!ops->setsockopt)) err = -EOPNOTSUPP; else - err = sock->ops->setsockopt(sock, level, optname, optval, + err = ops->setsockopt(sock, level, optname, optval, optlen); kfree(kernel_optval); out_put: @@ -2285,6 +2303,7 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { int max_optlen __maybe_unused; + const struct proto_ops *ops; int err, fput_needed; struct socket *sock; @@ -2299,12 +2318,13 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, if (!in_compat_syscall()) max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); - else if (unlikely(!sock->ops->getsockopt)) + else if (unlikely(!ops->getsockopt)) err = -EOPNOTSUPP; else - err = sock->ops->getsockopt(sock, level, optname, optval, + err = ops->getsockopt(sock, level, optname, optval, optlen); if (!in_compat_syscall()) @@ -2332,7 +2352,7 @@ int __sys_shutdown_sock(struct socket *sock, int how) err = security_socket_shutdown(sock, how); if (!err) - err = sock->ops->shutdown(sock, how); + err = READ_ONCE(sock->ops)->shutdown(sock, how); return err; } @@ -3324,6 +3344,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); + const struct proto_ops *ops; if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return sock_ioctl(file, cmd, (unsigned long)argp); @@ -3333,10 +3354,11 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, return compat_siocwandev(net, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: - if (!sock->ops->gettstamp) + ops = READ_ONCE(sock->ops); + if (!ops->gettstamp) return -ENOIOCTLCMD; - return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, - !COMPAT_USE_64BIT_TIME); + return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, + !COMPAT_USE_64BIT_TIME); case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: @@ -3417,6 +3439,7 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; @@ -3424,8 +3447,8 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, sk = sock->sk; net = sock_net(sk); - if (sock->ops->compat_ioctl) - ret = sock->ops->compat_ioctl(sock, cmd, arg); + if (ops->compat_ioctl) + ret = ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) @@ -3449,7 +3472,7 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { - return sock->ops->bind(sock, addr, addrlen); + return READ_ONCE(sock->ops)->bind(sock, addr, addrlen); } EXPORT_SYMBOL(kernel_bind); @@ -3463,7 +3486,7 @@ EXPORT_SYMBOL(kernel_bind); int kernel_listen(struct socket *sock, int backlog) { - return sock->ops->listen(sock, backlog); + return READ_ONCE(sock->ops)->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); @@ -3481,6 +3504,7 @@ EXPORT_SYMBOL(kernel_listen); int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; + const struct proto_ops *ops = READ_ONCE(sock->ops); int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, @@ -3488,15 +3512,15 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) if (err < 0) goto done; - err = sock->ops->accept(sock, *newsock, flags, true); + err = ops->accept(sock, *newsock, flags, true); if (err < 0) { sock_release(*newsock); *newsock = NULL; goto done; } - (*newsock)->ops = sock->ops; - __module_get((*newsock)->ops->owner); + (*newsock)->ops = ops; + __module_get(ops->owner); done: return err; @@ -3519,7 +3543,7 @@ EXPORT_SYMBOL(kernel_accept); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { - return sock->ops->connect(sock, addr, addrlen, flags); + return READ_ONCE(sock->ops)->connect(sock, addr, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); @@ -3534,7 +3558,7 @@ EXPORT_SYMBOL(kernel_connect); int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, 0); + return READ_ONCE(sock->ops)->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); @@ -3549,7 +3573,7 @@ EXPORT_SYMBOL(kernel_getsockname); int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, 1); + return READ_ONCE(sock->ops)->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername); @@ -3563,7 +3587,7 @@ EXPORT_SYMBOL(kernel_getpeername); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { - return sock->ops->shutdown(sock, how); + return READ_ONCE(sock->ops)->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e43f26382411..2ed29e40c6a9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -43,7 +43,7 @@ #include <net/udp.h> #include <net/tcp.h> #include <net/tcp_states.h> -#include <net/tls.h> +#include <net/tls_prot.h> #include <net/handshake.h> #include <linux/uaccess.h> #include <linux/highmem.h> @@ -226,27 +226,30 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) } static int -svc_tcp_sock_process_cmsg(struct svc_sock *svsk, struct msghdr *msg, +svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, struct cmsghdr *cmsg, int ret) { - if (cmsg->cmsg_level == SOL_TLS && - cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { - u8 content_type = *((u8 *)CMSG_DATA(cmsg)); - - switch (content_type) { - case TLS_RECORD_TYPE_DATA: - /* TLS sets EOR at the end of each application data - * record, even though there might be more frames - * waiting to be decrypted. - */ - msg->msg_flags &= ~MSG_EOR; - break; - case TLS_RECORD_TYPE_ALERT: - ret = -ENOTCONN; - break; - default: - ret = -EAGAIN; - } + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? + -ENOTCONN : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; } return ret; } @@ -258,13 +261,14 @@ svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg) struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; + struct socket *sock = svsk->sk_sock; int ret; msg->msg_control = &u; msg->msg_controllen = sizeof(u); - ret = sock_recvmsg(svsk->sk_sock, msg, MSG_DONTWAIT); + ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); if (unlikely(msg->msg_controllen != sizeof(u))) - ret = svc_tcp_sock_process_cmsg(svsk, msg, &u.cmsg, ret); + ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret); return ret; } @@ -1621,6 +1625,8 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + tls_handshake_close(svsk->sk_sock); + svc_sock_detach(xprt); if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9f010369100a..268a2cc61acd 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -47,7 +47,7 @@ #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> -#include <net/tls.h> +#include <net/tls_prot.h> #include <net/handshake.h> #include <linux/bvec.h> @@ -360,24 +360,27 @@ static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, struct cmsghdr *cmsg, int ret) { - if (cmsg->cmsg_level == SOL_TLS && - cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { - u8 content_type = *((u8 *)CMSG_DATA(cmsg)); - - switch (content_type) { - case TLS_RECORD_TYPE_DATA: - /* TLS sets EOR at the end of each application data - * record, even though there might be more frames - * waiting to be decrypted. - */ - msg->msg_flags &= ~MSG_EOR; - break; - case TLS_RECORD_TYPE_ALERT: - ret = -ENOTCONN; - break; - default: - ret = -EAGAIN; - } + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? + -EACCES : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; } return ret; } @@ -777,6 +780,8 @@ static void xs_stream_data_receive(struct sock_xprt *transport) } if (ret == -ESHUTDOWN) kernel_sock_shutdown(transport->sock, SHUT_RDWR); + else if (ret == -EACCES) + xprt_wake_pending_tasks(&transport->xprt, -EACCES); else xs_poll_check_readable(transport); out: @@ -1292,6 +1297,8 @@ static void xs_close(struct rpc_xprt *xprt) dprintk("RPC: xs_close xprt %p\n", xprt); + if (transport->sock) + tls_handshake_close(transport->sock); xs_reset_transport(transport); xprt->reestablish_timeout = 0; } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8cc42aea19c7..5b045284849e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -862,3 +862,28 @@ void switchdev_bridge_port_unoffload(struct net_device *brport_dev, NULL); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); + +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .dev = dev, + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + }, + }; + int err; + + ASSERT_RTNL(); + + err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_REPLAY, + brport_dev, &brport_info.info, + extack); + return notifier_to_errno(err); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_replay); diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 0772cfadaa0d..93f82398283d 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -131,6 +131,5 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); void tipc_set_node_id(struct net *net, u8 *id); void tipc_set_node_addr(struct net *net, u32 addr); char *tipc_nodeid2string(char *str, u8 *id); -u32 tipc_node_id2hash(u8 *id128); #endif diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 1ee60649bd17..41eac1ee0c09 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -214,8 +214,6 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); -int tipc_media_set_priority(const char *name, u32 new_value); -int tipc_media_set_window(const char *name, u32 new_value); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); diff --git a/net/tipc/link.h b/net/tipc/link.h index a16f401fdabd..d80f5649b395 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -148,8 +148,6 @@ int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq, struct sk_buff_head *retrq); -void tipc_link_build_bc_sync_msg(struct tipc_link *l, - struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq); diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index e231e6964d61..c677f6f082df 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -67,7 +67,6 @@ struct distr_item { __be32 key; }; -void tipc_named_bcast(struct net *net, struct sk_buff *skb); struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ); struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ); void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities); diff --git a/net/tipc/net.h b/net/tipc/net.h index d0c91d2df20a..1cb1e43cf34a 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -43,7 +43,6 @@ extern const struct nla_policy tipc_nl_net_policy[]; int tipc_net_init(struct net *net, u8 *node_id, u32 addr); void tipc_net_finalize_work(struct work_struct *work); -void tipc_sched_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tls/tls.h b/net/tls/tls.h index 86cef1c68e03..164d6a955e26 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -39,6 +39,7 @@ #include <linux/types.h> #include <linux/skmsg.h> #include <net/tls.h> +#include <net/tls_prot.h> #define TLS_PAGE_ORDER (min_t(unsigned int, PAGE_ALLOC_COSTLY_ORDER, \ TLS_MAX_PAYLOAD_SIZE >> PAGE_SHIFT)) @@ -86,10 +87,6 @@ void tls_ctx_free(struct sock *sk, struct tls_context *ctx); void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); -int tls_sk_query(struct sock *sk, int optname, char __user *optval, - int __user *optlen); -int tls_sk_attach(struct sock *sk, int optname, char __user *optval, - unsigned int optlen); void tls_err_abort(struct sock *sk, int err); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); @@ -110,6 +107,8 @@ bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t read_actor); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 529101eb20bd..2392d06845aa 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -440,9 +440,13 @@ static int tls_push_data(struct sock *sk, long timeo; if (flags & - ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES)) + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SPLICE_PAGES | MSG_EOR)) return -EOPNOTSUPP; + if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR)) + return -EINVAL; + if (unlikely(sk->sk_err)) return -sk->sk_err; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 4a8ee2f6badb..f550c84f3408 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -959,10 +959,12 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE]; ops[TLS_BASE][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_BASE][TLS_SW ].poll = tls_sk_poll; + ops[TLS_BASE][TLS_SW ].read_sock = tls_sw_read_sock; ops[TLS_SW ][TLS_SW ] = ops[TLS_SW ][TLS_BASE]; ops[TLS_SW ][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_SW ][TLS_SW ].poll = tls_sk_poll; + ops[TLS_SW ][TLS_SW ].read_sock = tls_sw_read_sock; #ifdef CONFIG_TLS_DEVICE ops[TLS_HW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index f37f4a0fcd3c..ca1e0e198ceb 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -369,7 +369,6 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, static int tls_strp_read_copyin(struct tls_strparser *strp) { - struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; desc.arg.data = strp; @@ -377,7 +376,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp) desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ - sock->ops->read_sock(strp->sk, &desc, tls_strp_copyin); + tcp_read_sock(strp->sk, &desc, tls_strp_copyin); return desc.error; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 53f944e6d8ef..5c122d7bb784 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -984,6 +984,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, int ret = 0; int pending; + if (!eor && (msg->msg_flags & MSG_EOR)) + return -EINVAL; + if (unlikely(msg->msg_controllen)) { ret = tls_process_cmsg(sk, msg, &record_type); if (ret) { @@ -1193,7 +1196,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | + MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | MSG_EOR | MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; @@ -1845,13 +1848,10 @@ tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot, return sk_flush_backlog(sk); } -static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, - bool nonblock) +static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, + bool nonblock) { long timeo; - int err; - - lock_sock(sk); timeo = sock_rcvtimeo(sk, nonblock); @@ -1865,26 +1865,30 @@ static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, !READ_ONCE(ctx->reader_present), &wait); remove_wait_queue(&ctx->wq, &wait); - if (timeo <= 0) { - err = -EAGAIN; - goto err_unlock; - } - if (signal_pending(current)) { - err = sock_intr_errno(timeo); - goto err_unlock; - } + if (timeo <= 0) + return -EAGAIN; + if (signal_pending(current)) + return sock_intr_errno(timeo); } WRITE_ONCE(ctx->reader_present, 1); return 0; +} -err_unlock: - release_sock(sk); +static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, + bool nonblock) +{ + int err; + + lock_sock(sk); + err = tls_rx_reader_acquire(sk, ctx, nonblock); + if (err) + release_sock(sk); return err; } -static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) +static void tls_rx_reader_release(struct sock *sk, struct tls_sw_context_rx *ctx) { if (unlikely(ctx->reader_contended)) { if (wq_has_sleeper(&ctx->wq)) @@ -1896,6 +1900,11 @@ static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) } WRITE_ONCE(ctx->reader_present, 0); +} + +static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) +{ + tls_rx_reader_release(sk, ctx); release_sock(sk); } @@ -2193,6 +2202,102 @@ splice_requeue: goto splice_read_end; } +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t read_actor) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct tls_prot_info *prot = &tls_ctx->prot_info; + struct strp_msg *rxm = NULL; + struct sk_buff *skb = NULL; + struct sk_psock *psock; + size_t flushed_at = 0; + bool released = true; + struct tls_msg *tlm; + ssize_t copied = 0; + ssize_t decrypted; + int err, used; + + psock = sk_psock_get(sk); + if (psock) { + sk_psock_put(sk, psock); + return -EINVAL; + } + err = tls_rx_reader_acquire(sk, ctx, true); + if (err < 0) + return err; + + /* If crypto failed the connection is broken */ + err = ctx->async_wait.err; + if (err) + goto read_sock_end; + + decrypted = 0; + do { + if (!skb_queue_empty(&ctx->rx_list)) { + skb = __skb_dequeue(&ctx->rx_list); + rxm = strp_msg(skb); + tlm = tls_msg(skb); + } else { + struct tls_decrypt_arg darg; + + err = tls_rx_rec_wait(sk, NULL, true, released); + if (err <= 0) + goto read_sock_end; + + memset(&darg.inargs, 0, sizeof(darg.inargs)); + + err = tls_rx_one_record(sk, NULL, &darg); + if (err < 0) { + tls_err_abort(sk, -EBADMSG); + goto read_sock_end; + } + + released = tls_read_flush_backlog(sk, prot, INT_MAX, + 0, decrypted, + &flushed_at); + skb = darg.skb; + rxm = strp_msg(skb); + tlm = tls_msg(skb); + decrypted += rxm->full_len; + + tls_rx_rec_done(ctx); + } + + /* read_sock does not support reading control messages */ + if (tlm->control != TLS_RECORD_TYPE_DATA) { + err = -EINVAL; + goto read_sock_requeue; + } + + used = read_actor(desc, skb, rxm->offset, rxm->full_len); + if (used <= 0) { + if (!copied) + err = used; + goto read_sock_requeue; + } + copied += used; + if (used < rxm->full_len) { + rxm->offset += used; + rxm->full_len -= used; + if (!desc->count) + goto read_sock_requeue; + } else { + consume_skb(skb); + if (!desc->count) + skb = NULL; + } + } while (skb); + +read_sock_end: + tls_rx_reader_release(sk, ctx); + return copied ? : err; + +read_sock_requeue: + __skb_queue_head(&ctx->rx_list, skb); + goto read_sock_end; +} + bool tls_sw_sock_is_readable(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); diff --git a/net/unix/scm.c b/net/unix/scm.c index f9152881d77f..e9dde7176c8a 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -29,10 +29,11 @@ struct sock *unix_get_socket(struct file *filp) /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops = READ_ONCE(sock->ops); struct sock *s = sock->sk; /* PF_UNIX ? */ - if (s && sock->ops && sock->ops->family == PF_UNIX) + if (s && ops && ops->family == PF_UNIX) u_sock = s; } else { /* Could be an io_uring instance */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index b769fc258931..352d042b130b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -348,37 +348,34 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0, off; - struct sk_buff *skb, *tmp; - int err = -EFAULT; + struct sk_buff *skb; + size_t total = 0; + int err; spin_lock_bh(&vvs->rx_lock); - skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { - off = 0; + skb_queue_walk(&vvs->rx_queue, skb) { + size_t bytes; - if (total == len) - break; + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; - while (total < len && off < skb->len) { - bytes = len - total; - if (bytes > skb->len - off) - bytes = skb->len - off; + spin_unlock_bh(&vvs->rx_lock); - /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. - */ - spin_unlock_bh(&vvs->rx_lock); + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + goto out; - err = memcpy_to_msg(msg, skb->data + off, bytes); - if (err) - goto out; + total += bytes; - spin_lock_bh(&vvs->rx_lock); + spin_lock_bh(&vvs->rx_lock); - total += bytes; - off += bytes; - } + if (total == len) + break; } spin_unlock_bh(&vvs->rx_lock); @@ -463,6 +460,63 @@ out: return err; } +static ssize_t +virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk, + struct msghdr *msg) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct sk_buff *skb; + size_t total, len; + + spin_lock_bh(&vvs->rx_lock); + + if (!vvs->msg_count) { + spin_unlock_bh(&vvs->rx_lock); + return 0; + } + + total = 0; + len = msg_data_left(msg); + + skb_queue_walk(&vvs->rx_queue, skb) { + struct virtio_vsock_hdr *hdr; + + if (total < len) { + size_t bytes; + int err; + + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; + + spin_unlock_bh(&vvs->rx_lock); + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + return err; + + spin_lock_bh(&vvs->rx_lock); + } + + total += skb->len; + hdr = virtio_vsock_hdr(skb); + + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) + msg->msg_flags |= MSG_EOR; + + break; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; +} + static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags) @@ -557,9 +611,9 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); + return virtio_transport_seqpacket_do_peek(vsk, msg); + else + return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index b7b072194282..dbda3ababa14 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -116,9 +116,6 @@ struct vmci_transport { spinlock_t lock; /* protects sk. */ }; -int vmci_transport_register(void); -void vmci_transport_unregister(void); - int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst, struct sockaddr_vm *src); int vmci_transport_send_read_bh(struct sockaddr_vm *dst, diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 10ea85c03147..fcfc8472f73d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -25,6 +25,7 @@ #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> +#include <net/netdev_rx_queue.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -135,14 +136,14 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) { - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u64 addr; int err; addr = xp_get_handle(xskb); - err = xskq_prod_reserve_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; @@ -152,48 +153,138 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; } -static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *from_buf, *to_buf; - u32 metalen; + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u32 frags = xdp_buff_has_frags(xdp); + struct xdp_buff_xsk *pos, *tmp; + struct list_head *xskb_list; + u32 contd = 0; + int err; - if (unlikely(xdp_data_meta_unsupported(from))) { - from_buf = from->data; - to_buf = to->data; - metalen = 0; - } else { - from_buf = from->data_meta; - metalen = from->data - from->data_meta; - to_buf = to->data - metalen; + if (frags) + contd = XDP_PKT_CONTD; + + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err || likely(!frags)) + goto out; + + xskb_list = &xskb->pool->xskb_list; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + if (list_is_singular(xskb_list)) + contd = 0; + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) + return err; + list_del(&pos->xskb_list_node); } - memcpy(to_buf, from_buf, len + metalen); +out: + return err; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static void *xsk_copy_xdp_start(struct xdp_buff *from) { + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} + +static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; + + to_len -= copy_len; + to += copy_len; + } +} + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; + struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; - int err; - u32 len; + skb_frag_t *frag; - len = xdp->data_end - xdp->data; - if (len > xsk_pool_get_rx_frame_size(xs->pool)) { - xs->rx_dropped++; - return -ENOSPC; + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; + + xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; } - xsk_xdp = xsk_buff_alloc(xs->pool); - if (!xsk_xdp) { + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } - xsk_copy_xdp(xsk_xdp, xdp, len); - err = __xsk_rcv_zc(xs, xsk_xdp, len); - if (err) { - xsk_buff_free(xsk_xdp); - return err; + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + return 0; } @@ -215,7 +306,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; @@ -223,6 +314,11 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + xs->rx_dropped++; + return -ENOSPC; + } + sk_mark_napi_id_once_xdp(&xs->sk, xdp); return 0; } @@ -236,12 +332,13 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (!err) { - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); } spin_unlock_bh(&xs->rx_lock); @@ -250,19 +347,19 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; - u32 len; - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; - return __xsk_rcv_zc(xs, xdp, len); + return xsk_rcv_zc(xs, xdp, len); } - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; @@ -321,7 +418,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) + xskq_cons_release(xs->tx); continue; } @@ -408,37 +506,91 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static void xsk_destruct_skb(struct sk_buff *skb) +static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(xs->pool->cq, addr); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + + return ret; +} + +static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) { - u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; - struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_submit_addr(xs->pool->cq, addr); + xskq_prod_submit_n(xs->pool->cq, n); spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; +} + +static void xsk_destruct_skb(struct sk_buff *skb) +{ + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } +static void xsk_set_destructor_arg(struct sk_buff *skb) +{ + long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + + skb_shinfo(skb)->destructor_arg = (void *)num; +} + +static void xsk_consume_skb(struct sk_buff *skb) +{ + struct xdp_sock *xs = xdp_sk(skb->sk); + + skb->destructor = sock_wfree; + xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + /* Free skb without triggering the perf drop trace */ + consume_skb(skb); + xs->skb = NULL; +} + +static void xsk_drop_skb(struct sk_buff *skb) +{ + xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); + xsk_consume_skb(skb); +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); - skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); - skb_reserve(skb, hr); + skb_reserve(skb, hr); + } addr = desc->addr; len = desc->len; @@ -448,7 +600,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, offset = offset_in_page(buffer); addr = buffer - pool->addrs; - for (copied = 0, i = 0; copied < len; i++) { + for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { + if (unlikely(i >= MAX_SKB_FRAGS)) + return ERR_PTR(-EFAULT); + page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); @@ -473,43 +628,77 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct net_device *dev = xs->dev; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; + int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); - if (IS_ERR(skb)) - return skb; + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_err; + } } else { u32 hr, tr, len; void *buffer; - int err; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); - tr = dev->needed_tailroom; + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; - skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + goto free_err; - skb_reserve(skb, hr); - skb_put(skb, len); + skb_reserve(skb, hr); + skb_put(skb, len); - buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); - err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { - kfree_skb(skb); - return ERR_PTR(err); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) + goto free_err; + } else { + int nr_frags = skb_shinfo(skb)->nr_frags; + struct page *page; + u8 *vaddr; + + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { + err = -EFAULT; + goto free_err; + } + + page = alloc_page(xs->sk.sk_allocation); + if (unlikely(!page)) { + err = -EAGAIN; + goto free_err; + } + + vaddr = kmap_local_page(page); + memcpy(vaddr, buffer, len); + kunmap_local(vaddr); + + skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); } } skb->dev = dev; skb->priority = xs->sk.sk_priority; skb->mark = READ_ONCE(xs->sk.sk_mark); - skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; skb->destructor = xsk_destruct_skb; + xsk_set_destructor_arg(skb); return skb; + +free_err: + if (err == -EAGAIN) { + xsk_cq_cancel_locked(xs, 1); + } else { + xsk_set_destructor_arg(skb); + xsk_drop_skb(skb); + xskq_cons_release(xs->tx); + } + + return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) @@ -519,7 +708,6 @@ static int __xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; - unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -544,47 +732,51 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (xskq_prod_reserve(xs->pool->cq)) { - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + if (xsk_cq_reserve_addr_locked(xs, desc.addr)) goto out; - } - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - goto out; + if (err == -EAGAIN) + goto out; + err = 0; + continue; + } + + xskq_cons_release(xs->tx); + + if (xp_mb_desc(&desc)) { + xs->skb = skb; + continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ - skb->destructor = sock_wfree; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - /* Free skb without triggering the perf drop trace */ - consume_skb(skb); + xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); + xsk_consume_skb(skb); err = -EAGAIN; goto out; } - xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; + xs->skb = NULL; goto out; } sent_frame = true; + xs->skb = NULL; } - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) { + if (xs->skb) + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } out: if (sent_frame) @@ -834,6 +1026,9 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); + if (xs->skb) + xsk_drop_skb(xs->skb); + mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); @@ -897,7 +1092,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | - XDP_USE_NEED_WAKEUP)) + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); @@ -929,7 +1124,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || - (flags & XDP_USE_NEED_WAKEUP)) { + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; @@ -1029,6 +1224,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; + xs->sg = !!(flags & XDP_USE_SG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 26f6d304451e..b3f7b310811e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -86,6 +86,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->umem = umem; pool->addrs = umem->addrs; INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); spin_lock_init(&pool->cq_lock); @@ -99,6 +100,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -187,6 +189,11 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_pool; } + if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + bpf.command = XDP_SETUP_XSK_POOL; bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6d40a77fccbe..13354a1e4280 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -48,6 +48,11 @@ struct xsk_queue { size_t ring_vmalloc_size; }; +struct parsed_desc { + u32 mb; + u32 valid; +}; + /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and @@ -130,18 +135,26 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} + static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 offset = desc->addr & (pool->chunk_size - 1); + if (!desc->len) + return false; + if (offset + desc->len > pool->chunk_size) return false; if (desc->addr >= pool->addrs_cnt) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -151,6 +164,9 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + if (!desc->len) + return false; + if (desc->len > pool->chunk_size) return false; @@ -158,7 +174,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -170,6 +186,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool, xp_aligned_validate_desc(pool, desc); } +static inline bool xskq_has_descs(struct xsk_queue *q) +{ + return q->cached_cons != q->cached_prod; +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) @@ -185,17 +206,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { - while (q->cached_cons != q->cached_prod) { + if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; - if (xskq_cons_is_valid_desc(q, desc, pool)) - return true; - - q->cached_cons++; + return xskq_cons_is_valid_desc(q, desc, pool); } + q->queue_empty_descs++; return false; } @@ -204,30 +223,52 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) q->cached_cons += cnt; } -static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, - u32 max) +static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, + struct xdp_desc *desc, struct parsed_desc *parsed) +{ + parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); + parsed->mb = xp_mb_desc(desc); +} + +static inline +u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, + u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; + u32 total_descs = 0, nr_frags = 0; + /* track first entry, if stumble upon *any* invalid descriptor, rewind + * current packet that consists of frags and stop the processing + */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; + struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; - if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { - /* Skip the entry */ - cached_cons++; - continue; + cached_cons++; + parse_desc(q, pool, &descs[nb_entries], &parsed); + if (unlikely(!parsed.valid)) + break; + + if (likely(!parsed.mb)) { + total_descs += (nr_frags + 1); + nr_frags = 0; + } else { + nr_frags++; + if (nr_frags == pool->netdev->xdp_zc_max_segs) { + nr_frags = 0; + break; + } } - nb_entries++; - cached_cons++; } + cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); - return nb_entries; + return total_descs; } /* Functions for consumers */ @@ -292,6 +333,11 @@ static inline void xskq_cons_release(struct xsk_queue *q) q->cached_cons++; } +static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons -= cnt; +} + static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -319,9 +365,9 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? false : true; } -static inline void xskq_prod_cancel(struct xsk_queue *q) +static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { - q->cached_prod--; + q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -360,7 +406,7 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, - u64 addr, u32 len) + u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; @@ -372,6 +418,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; + ring->desc[idx].options = flags; return 0; } @@ -386,16 +433,6 @@ static inline void xskq_prod_submit(struct xsk_queue *q) __xskq_prod_submit(q, q->cached_prod); } -static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 idx = q->ring->producer; - - ring->desc[idx++ & q->ring_mask] = addr; - - __xskq_prod_submit(q, idx); -} - static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 533697e2488f..3784534c9185 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -247,12 +247,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - /* We don't yet support UDP encapsulation and TFC padding. */ - if (x->encap || x->tfcpad) { - NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded"); - return -EINVAL; - } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); @@ -260,6 +254,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; + + /* We don't yet support UDP encapsulation and TFC padding. */ + if ((!is_packet_offload && x->encap) || x->tfcpad) { + NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded"); + return -EINVAL; + } + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { |