diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/dev.c | 279 | ||||
-rw-r--r-- | net/core/dev.h | 14 | ||||
-rw-r--r-- | net/core/dev_addr_lists.c | 2 | ||||
-rw-r--r-- | net/core/dev_api.c | 13 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 5 | ||||
-rw-r--r-- | net/core/dst.c | 10 | ||||
-rw-r--r-- | net/core/dst_cache.c | 2 | ||||
-rw-r--r-- | net/core/filter.c | 35 | ||||
-rw-r--r-- | net/core/hotdata.c | 5 | ||||
-rw-r--r-- | net/core/ieee8021q_helpers.c | 44 | ||||
-rw-r--r-- | net/core/neighbour.c | 558 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 80 | ||||
-rw-r--r-- | net/core/net-sysfs.h | 2 | ||||
-rw-r--r-- | net/core/net_namespace.c | 70 | ||||
-rw-r--r-- | net/core/netclassid_cgroup.c | 4 | ||||
-rw-r--r-- | net/core/netdev-genl-gen.c | 5 | ||||
-rw-r--r-- | net/core/netdev-genl.c | 14 | ||||
-rw-r--r-- | net/core/netdev_rx_queue.c | 6 | ||||
-rw-r--r-- | net/core/netpoll.c | 480 | ||||
-rw-r--r-- | net/core/page_pool.c | 36 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 10 | ||||
-rw-r--r-- | net/core/scm.c | 32 | ||||
-rw-r--r-- | net/core/selftests.c | 67 | ||||
-rw-r--r-- | net/core/skbuff.c | 38 | ||||
-rw-r--r-- | net/core/skmsg.c | 7 | ||||
-rw-r--r-- | net/core/sock.c | 73 | ||||
-rw-r--r-- | net/core/sock_map.c | 13 | ||||
-rw-r--r-- | net/core/stream.c | 8 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 37 |
29 files changed, 1109 insertions, 840 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index be97c440ecd5..b28ce68830b2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1267,33 +1267,32 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * __dev_get_by_flags - find any device with given flags - * @net: the applicable net namespace - * @if_flags: IFF_* values - * @mask: bitmask of bits in if_flags to check + * netdev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace + * @tracker: tracking object for the acquired reference + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. * - * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. Must be called inside - * rtnl_lock(), and result refcount is unchanged. + * Context: rcu_read_lock() must be held. + * Returns: NULL if a device is not found or a pointer to the device. */ - -struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, + unsigned short if_flags, unsigned short mask) { - struct net_device *dev, *ret; - - ASSERT_RTNL(); + struct net_device *dev; - ret = NULL; - for_each_netdev(net, dev) { - if (((dev->flags ^ if_flags) & mask) == 0) { - ret = dev; - break; + for_each_netdev_rcu(net, dev) { + if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) { + netdev_hold(dev, tracker, GFP_ATOMIC); + return dev; } } - return ret; + + return NULL; } -EXPORT_SYMBOL(__dev_get_by_flags); +EXPORT_IPV6_MOD(netdev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device @@ -1769,7 +1768,7 @@ static void __dev_close(struct net_device *dev) list_del(&single); } -void dev_close_many(struct list_head *head, bool unlink) +void netif_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1787,7 +1786,7 @@ void dev_close_many(struct list_head *head, bool unlink) list_del_init(&dev->close_list); } } -EXPORT_SYMBOL(dev_close_many); +EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL"); void netif_close(struct net_device *dev) { @@ -1795,7 +1794,7 @@ void netif_close(struct net_device *dev) LIST_HEAD(single); list_add(&dev->close_list, &single); - dev_close_many(&single, true); + netif_close_many(&single, true); list_del(&single); } } @@ -3179,7 +3178,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { - ASSERT_RTNL(); netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, @@ -3229,7 +3227,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { - ASSERT_RTNL(); netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, @@ -4028,7 +4025,10 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) unsigned int hdr_len; /* mac layer + network layer */ - hdr_len = skb_transport_offset(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { @@ -4798,7 +4798,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in - * napi_enable()/dev_set_threaded(). + * napi_enable()/netif_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. @@ -5749,6 +5749,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; @@ -5840,8 +5841,10 @@ skip_taps: #endif skb_reset_redirect(skb); skip_classify: - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { + drop_reason = SKB_DROP_REASON_PFMEMALLOC; goto drop; + } if (skb_vlan_tag_present(skb)) { if (pt_prev) { @@ -5939,8 +5942,6 @@ check_vlan_id: } if (pt_prev) { - if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) - goto drop; *ppt_prev = pt_prev; } else { drop: @@ -5948,7 +5949,8 @@ drop: dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); - kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); + + kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ @@ -6576,8 +6578,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) * it, we need to bound somehow the time packets are kept in * the GRO layer. */ - gro_flush(&n->gro, !!timeout); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, !!timeout); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6647,8 +6648,7 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) } /* Flush too old packets. If HZ < 1000, flush all packets */ - gro_flush(&napi->gro, HZ >= 1000); - gro_normal_list(&napi->gro); + gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -6926,22 +6926,83 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -int dev_set_threaded(struct net_device *dev, bool threaded) +static void napi_stop_kthread(struct napi_struct *napi) +{ + unsigned long val, new; + + /* Wait until the napi STATE_THREADED is unset. */ + while (true) { + val = READ_ONCE(napi->state); + + /* If napi kthread own this napi or the napi is idle, + * STATE_THREADED can be unset here. + */ + if ((val & NAPIF_STATE_SCHED_THREADED) || + !(val & NAPIF_STATE_SCHED)) { + new = val & (~NAPIF_STATE_THREADED); + } else { + msleep(20); + continue; + } + + if (try_cmpxchg(&napi->state, &val, new)) + break; + } + + /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by + * the kthread. + */ + while (true) { + if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + break; + + msleep(20); + } + + kthread_stop(napi->thread); + napi->thread = NULL; +} + +int napi_set_threaded(struct napi_struct *napi, + enum netdev_napi_threaded threaded) +{ + if (threaded) { + if (!napi->thread) { + int err = napi_kthread_create(napi); + + if (err) + return err; + } + } + + if (napi->config) + napi->config->threaded = threaded; + + if (!threaded && napi->thread) { + napi_stop_kthread(napi); + } else { + /* Make sure kthread is created before THREADED bit is set. */ + smp_mb__before_atomic(); + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + } + + return 0; +} + +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { struct napi_struct *napi; int err = 0; netdev_assert_locked_or_invisible(dev); - if (dev->threaded == threaded) - return 0; - if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { - threaded = false; + threaded = NETDEV_NAPI_THREADED_DISABLED; break; } } @@ -6961,12 +7022,32 @@ int dev_set_threaded(struct net_device *dev, bool threaded) * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!threaded && napi->thread) + napi_stop_kthread(napi); + else + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + } return err; } -EXPORT_SYMBOL(dev_set_threaded); + +/** + * netif_threaded_enable() - enable threaded NAPIs + * @dev: net_device instance + * + * Enable threaded mode for the NAPI instances of the device. This may be useful + * for devices where multiple NAPI instances get scheduled by a single + * interrupt. Threaded NAPI allows moving the NAPI processing to cores other + * than the core where IRQ is mapped. + * + * This function should be called before @dev is registered. + */ +void netif_threaded_enable(struct net_device *dev) +{ + WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED)); +} +EXPORT_SYMBOL(netif_threaded_enable); /** * netif_queue_set_napi - Associate queue with the napi @@ -7182,6 +7263,8 @@ static void napi_restore_config(struct napi_struct *n) napi_hash_add(n); n->config->napi_id = n->napi_id; } + + WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded)); } static void napi_save_config(struct napi_struct *n) @@ -7279,7 +7362,7 @@ void netif_napi_add_weight_locked(struct net_device *dev, * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = false; + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); @@ -7448,8 +7531,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) } /* Flush too old packets. If HZ < 1000, flush all packets */ - gro_flush(&n->gro, HZ >= 1000); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, HZ >= 1000); /* Some drivers may have called napi_schedule * prior to exhausting their budget. @@ -9387,12 +9469,12 @@ void dev_set_rx_mode(struct net_device *dev) } /** - * dev_get_flags - get flags reported to userspace - * @dev: device + * netif_get_flags() - get flags reported to userspace + * @dev: device * - * Get the combination of flag bits exported through APIs to userspace. + * Get the combination of flag bits exported through APIs to userspace. */ -unsigned int dev_get_flags(const struct net_device *dev) +unsigned int netif_get_flags(const struct net_device *dev) { unsigned int flags; @@ -9415,7 +9497,7 @@ unsigned int dev_get_flags(const struct net_device *dev) return flags; } -EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(netif_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) @@ -9527,7 +9609,7 @@ int netif_change_flags(struct net_device *dev, unsigned int flags, return ret; } -int __dev_set_mtu(struct net_device *dev, int new_mtu) +int __netif_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; @@ -9538,7 +9620,7 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) WRITE_ONCE(dev->mtu, new_mtu); return 0; } -EXPORT_SYMBOL(__dev_set_mtu); +EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL"); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) @@ -9557,18 +9639,22 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu, } /** - * netif_set_mtu_ext - Change maximum transfer unit - * @dev: device - * @new_mtu: new transfer unit - * @extack: netlink extended ack + * netif_set_mtu_ext() - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * @extack: netlink extended ack + * + * Change the maximum transfer size of the network device. * - * Change the maximum transfer size of the network device. + * Return: 0 on success, -errno on failure. */ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; + netdev_ops_assert_locked(dev); + if (new_mtu == dev->mtu) return 0; @@ -9585,7 +9671,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, return err; orig_mtu = dev->mtu; - err = __dev_set_mtu(dev, new_mtu); + err = __netif_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, @@ -9595,7 +9681,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. */ - __dev_set_mtu(dev, orig_mtu); + __netif_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } @@ -9649,13 +9735,15 @@ void netif_set_group(struct net_device *dev, int new_group) } /** - * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. - * @dev: device - * @addr: new address - * @extack: netlink extended ack + * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + * + * Return: 0 on success, -errno on failure. */ -int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, - struct netlink_ext_ack *extack) +int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, @@ -9667,7 +9755,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } -EXPORT_SYMBOL(dev_pre_changeaddr_notify); +EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL"); int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) @@ -9681,7 +9769,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev_pre_changeaddr_notify(dev, ss->__data, extack); + err = netif_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { @@ -9698,7 +9786,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, DECLARE_RWSEM(dev_addr_sem); /* "sa" is a true struct sockaddr with limited "sa_data" member. */ -int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); struct net_device *dev; @@ -9724,7 +9812,7 @@ unlock: up_read(&dev_addr_sem); return ret; } -EXPORT_SYMBOL(dev_get_mac_address); +EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL"); int netif_change_carrier(struct net_device *dev, bool new_carrier) { @@ -9777,16 +9865,17 @@ int dev_get_phys_port_name(struct net_device *dev, } /** - * dev_get_port_parent_id - Get the device's port parent identifier - * @dev: network device - * @ppid: pointer to a storage for the port's parent identifier - * @recurse: allow/disallow recursion to lower devices + * netif_get_port_parent_id() - Get the device's port parent identifier + * @dev: network device + * @ppid: pointer to a storage for the port's parent identifier + * @recurse: allow/disallow recursion to lower devices + * + * Get the devices's port parent identifier. * - * Get the devices's port parent identifier + * Return: 0 on success, -errno on failure. */ -int dev_get_port_parent_id(struct net_device *dev, - struct netdev_phys_item_id *ppid, - bool recurse) +int netif_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; @@ -9805,7 +9894,7 @@ int dev_get_port_parent_id(struct net_device *dev, return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = dev_get_port_parent_id(lower_dev, ppid, true); + err = netif_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) @@ -9816,7 +9905,7 @@ int dev_get_port_parent_id(struct net_device *dev, return err; } -EXPORT_SYMBOL(dev_get_port_parent_id); +EXPORT_SYMBOL(netif_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have @@ -9829,8 +9918,8 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; - if (dev_get_port_parent_id(a, &a_id, true) || - dev_get_port_parent_id(b, &b_id, true)) + if (netif_get_port_parent_id(a, &a_id, true) || + netif_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); @@ -10364,7 +10453,8 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog, + attr->link_create.attach_type); link->dev = dev; link->flags = attr->link_create.flags; @@ -10730,12 +10820,14 @@ sync_lower: * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ + udp_tunnel_nic_lock(dev); if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } + udp_tunnel_nic_unlock(dev); } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { @@ -11715,7 +11807,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_len = sizeof_priv; - ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); + ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev"); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) @@ -11937,21 +12029,8 @@ static void netdev_rss_contexts_free(struct net_device *dev) mutex_lock(&dev->ethtool->rss_lock); xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { - struct ethtool_rxfh_param rxfh; - - rxfh.indir = ethtool_rxfh_context_indir(ctx); - rxfh.key = ethtool_rxfh_context_key(ctx); - rxfh.hfunc = ctx->hfunc; - rxfh.input_xfrm = ctx->input_xfrm; - rxfh.rss_context = context; - rxfh.rss_delete = true; - xa_erase(&dev->ethtool->rss_ctx, context); - if (dev->ethtool_ops->create_rxfh_context) - dev->ethtool_ops->remove_rxfh_context(dev, ctx, - context, NULL); - else - dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL); + dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); kfree(ctx); } xa_destroy(&dev->ethtool->rss_ctx); @@ -12036,7 +12115,7 @@ void unregister_netdevice_many_notify(struct list_head *head, netdev_lock(dev); } } - dev_close_many(&close_head, true); + netif_close_many(&close_head, true); /* ... now unlock them and go over the rest. */ list_for_each_entry(dev, head, unreg_list) { if (netdev_need_ops_lock(dev)) @@ -12044,7 +12123,7 @@ void unregister_netdevice_many_notify(struct list_head *head, else list_add_tail(&dev->close_list, &close_head); } - dev_close_many(&close_head, true); + netif_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ diff --git a/net/core/dev.h b/net/core/dev.h index e93f36b7ddf3..ab69edc0c3e3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -315,6 +315,20 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, WRITE_ONCE(n->irq_suspend_timeout, timeout); } +static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) +{ + if (test_bit(NAPI_STATE_THREADED, &n->state)) + return NETDEV_NAPI_THREADED_ENABLED; + + return NETDEV_NAPI_THREADED_DISABLED; +} + +int napi_set_threaded(struct napi_struct *n, + enum netdev_napi_threaded threaded); + +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded); + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 90716bd736f3..76c91f224886 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -603,7 +603,7 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); - err = dev_pre_changeaddr_notify(dev, addr, NULL); + err = netif_pre_changeaddr_notify(dev, addr, NULL); if (err) return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); diff --git a/net/core/dev_api.c b/net/core/dev_api.c index 1bf0153195f2..f28852078aa6 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -367,3 +367,16 @@ void netdev_state_change(struct net_device *dev) netdev_unlock_ops(dev); } EXPORT_SYMBOL(netdev_state_change); + +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) +{ + int ret; + + netdev_lock(dev); + ret = netif_set_threaded(dev, threaded); + netdev_unlock(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_threaded); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 616479e71466..9c0ad7f4b5d8 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -147,7 +147,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); + ifr->ifr_flags = (short)netif_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface @@ -728,7 +728,8 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, switch (cmd) { case SIOCGIFHWADDR: dev_load(net, ifr->ifr_name); - ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); + ret = netif_get_mac_address(&ifr->ifr_hwaddr, net, + ifr->ifr_name); if (colon) *colon = ':'; return ret; diff --git a/net/core/dst.c b/net/core/dst.c index 795ca07e28a4..e2de8b68c41d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -145,12 +145,12 @@ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; - dst->obsolete = DST_OBSOLETE_DEAD; + WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); - dst->input = dst_discard; - dst->output = dst_discard_out; - dst->dev = blackhole_netdev; + WRITE_ONCE(dst->input, dst_discard); + WRITE_ONCE(dst->output, dst_discard_out); + WRITE_ONCE(dst->dev, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } @@ -263,7 +263,7 @@ unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst->dev->mtu; + return mtu ? : dst_dev(dst)->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 93a04d18e505..9ab4902324e1 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -52,7 +52,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, if (unlikely(!time_after(idst->refresh_ts, READ_ONCE(dst_cache->reset_ts)) || - (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); goto fail; diff --git a/net/core/filter.c b/net/core/filter.c index 7a72f766aacf..c09a85c17496 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -122,6 +122,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet + * @reason: record drop reason on errors (negative return value) * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller @@ -130,7 +131,8 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * be accepted or -EPERM if the packet should be tossed. * */ -int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, + unsigned int cap, enum skb_drop_reason *reason) { int err; struct sk_filter *filter; @@ -142,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + *reason = SKB_DROP_REASON_PFMEMALLOC; return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SOCKET_FILTER; return err; + } err = security_sock_rcv_skb(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SECURITY_HOOK; return err; + } rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); @@ -162,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; + if (err) + *reason = SKB_DROP_REASON_SOCKET_FILTER; } rcu_read_unlock(); @@ -8690,7 +8699,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; - case offsetof(struct __sk_buff, sk): + case bpf_ctx_range_ptr(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; @@ -9268,7 +9277,7 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case offsetof(struct bpf_sock_addr, sk): + case bpf_ctx_range_ptr(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) @@ -9318,17 +9327,17 @@ static bool sock_ops_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - case offsetof(struct bpf_sock_ops, sk): + case bpf_ctx_range_ptr(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; - case offsetof(struct bpf_sock_ops, skb_data): + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct bpf_sock_ops, skb_data_end): + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; @@ -9337,7 +9346,7 @@ static bool sock_ops_is_valid_access(int off, int size, bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); - case offsetof(struct bpf_sock_ops, skb_hwtstamp): + case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; @@ -9407,17 +9416,17 @@ static bool sk_msg_is_valid_access(int off, int size, return false; switch (off) { - case offsetof(struct sk_msg_md, data): + case bpf_ctx_range_ptr(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; - case offsetof(struct sk_msg_md, data_end): + case bpf_ctx_range_ptr(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; - case offsetof(struct sk_msg_md, sk): + case bpf_ctx_range_ptr(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; @@ -11623,7 +11632,7 @@ static bool sk_lookup_is_valid_access(int off, int size, return false; switch (off) { - case offsetof(struct bpf_sk_lookup, sk): + case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 0bc893d5f07b..95d0a4df1006 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -2,7 +2,9 @@ #include <linux/cache.h> #include <linux/jiffies.h> #include <linux/list.h> +#include <net/aligned_data.h> #include <net/hotdata.h> +#include <net/ip.h> #include <net/proto_memory.h> struct net_hotdata net_hotdata __cacheline_aligned = { @@ -22,3 +24,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); + +struct net_aligned_data net_aligned_data; +EXPORT_IPV6_MOD(net_aligned_data); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c index 759a9b9f3f89..669b357b73b2 100644 --- a/net/core/ieee8021q_helpers.c +++ b/net/core/ieee8021q_helpers.c @@ -7,6 +7,11 @@ #include <net/dscp.h> #include <net/ieee8021q.h> +/* verify that table covers all 8 traffic types */ +#define TT_MAP_SIZE_OK(tbl) \ + compiletime_assert(ARRAY_SIZE(tbl) == IEEE8021Q_TT_MAX, \ + #tbl " size mismatch") + /* The following arrays map Traffic Types (TT) to traffic classes (TC) for * different number of queues as shown in the example provided by * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and @@ -101,51 +106,28 @@ int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) switch (num_queues) { case 8: - compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_8queue_tt_tc_map != max - 1"); + TT_MAP_SIZE_OK(ieee8021q_8queue_tt_tc_map); return ieee8021q_8queue_tt_tc_map[tt]; case 7: - compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_7queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_7queue_tt_tc_map); return ieee8021q_7queue_tt_tc_map[tt]; case 6: - compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_6queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_6queue_tt_tc_map); return ieee8021q_6queue_tt_tc_map[tt]; case 5: - compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_5queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_5queue_tt_tc_map); return ieee8021q_5queue_tt_tc_map[tt]; case 4: - compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_4queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_4queue_tt_tc_map); return ieee8021q_4queue_tt_tc_map[tt]; case 3: - compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_3queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_3queue_tt_tc_map); return ieee8021q_3queue_tt_tc_map[tt]; case 2: - compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_2queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_2queue_tt_tc_map); return ieee8021q_2queue_tt_tc_map[tt]; case 1: - compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_1queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_1queue_tt_tc_map); return ieee8021q_1queue_tt_tc_map[tt]; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 49dce9a82295..bddfa389effa 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -28,6 +28,7 @@ #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> +#include <net/ip.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> @@ -53,8 +54,8 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -153,11 +154,12 @@ static void neigh_update_gc_list(struct neighbour *n) if (n->dead) goto out; - /* remove from the gc list if new state is permanent or if neighbor - * is externally learned; otherwise entry should be on the gc list + /* remove from the gc list if new state is permanent or if neighbor is + * externally learned / validated; otherwise entry should be on the gc + * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || - n->flags & NTF_EXT_LEARNED; + n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { @@ -204,6 +206,7 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) @@ -221,6 +224,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, *notify = 1; *managed_update = true; } + if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { + if (ndm_flags & NTF_EXT_VALIDATED) + neigh->flags |= NTF_EXT_VALIDATED; + else + neigh->flags &= ~NTF_EXT_VALIDATED; + *notify = 1; + *gc_update = true; + } } bool neigh_remove_one(struct neighbour *n) @@ -368,6 +379,43 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, } } +static void neigh_flush_one(struct neighbour *n) +{ + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + + write_lock(&n->lock); + + neigh_del_timer(n); + neigh_mark_dead(n); + + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + + neigh_dbg(2, "neigh %p is stray\n", n); + } + + write_unlock(&n->lock); + + neigh_cleanup_and_release(n); +} + static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { @@ -378,35 +426,29 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { - if (skip_perm && n->nud_state & NUD_PERMANENT) + if (skip_perm && + (n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_VALIDATED)) continue; - hlist_del_rcu(&n->hash); - hlist_del_rcu(&n->dev_list); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - * We must destroy neighbour entry, - * but someone still uses it. - * - * The destroy will be delayed until - * the last user releases us, but - * we must kill timers etc. and move - * it to safe state. - */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - WRITE_ONCE(n->output, neigh_blackhole); - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + neigh_flush_one(n); + } +} + +static void neigh_flush_table(struct neigh_table *tbl) +{ + struct neigh_hash_table *nht; + int i; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i < (1 << nht->hash_shift); i++) { + struct hlist_node *tmp; + struct neighbour *n; + + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) + neigh_flush_one(n); } } @@ -422,8 +464,15 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev); + if (likely(dev)) { + neigh_flush_dev(tbl, dev, skip_perm); + } else { + DEBUG_NET_WARN_ON_ONCE(skip_perm); + neigh_flush_table(tbl); + } + write_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) @@ -706,54 +755,53 @@ static u32 pneigh_hash(const void *pkey, unsigned int key_len) return hash_val; } -static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, - struct net *net, - const void *pkey, - unsigned int key_len, - struct net_device *dev) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { + struct pneigh_entry *n; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(tbl->phash_buckets[hash_val], + lockdep_is_held(&tbl->phash_lock)); + while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; - n = n->next; - } - return NULL; -} -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); + } - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); + return NULL; } -EXPORT_SYMBOL_GPL(__pneigh_lookup); +EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); - - read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); - read_unlock_bh(&tbl->lock); + unsigned int key_len; + u32 hash_val; + int err = 0; - if (n || !creat) - goto out; + mutex_lock(&tbl->phash_lock); - ASSERT_RTNL(); + n = pneigh_lookup(tbl, net, pkey, dev); + if (n) + goto update; + key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + if (!n) { + err = -ENOBUFS; goto out; + } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); @@ -763,73 +811,98 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } - write_lock_bh(&tbl->lock); + hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: - return n; + mutex_unlock(&tbl->phash_lock); + return err; } -EXPORT_SYMBOL(pneigh_lookup); +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - struct pneigh_entry *n, **np; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + struct pneigh_entry *n, __rcu **np; + unsigned int key_len; + u32 hash_val; - write_lock_bh(&tbl->lock); - for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + + mutex_lock(&tbl->phash_lock); + + for (np = &tbl->phash_buckets[hash_val]; + (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(*np, n->next); + + mutex_unlock(&tbl->phash_lock); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev) +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np; + LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, 1)) != NULL) { + if (skip_perm && n->permanent) + goto skip; if (!dev || n->dev == dev) { - *np = n->next; - n->next = freelist; - freelist = n; + rcu_assign_pointer(*np, n->next); + list_add(&n->free_node, &head); continue; } +skip: np = &n->next; } } - write_unlock_bh(&tbl->lock); - while ((n = freelist)) { - freelist = n->next; - n->next = NULL; + + mutex_unlock(&tbl->phash_lock); + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); } - return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) @@ -937,7 +1010,8 @@ static void neigh_periodic_work(struct work_struct *work) state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || - (n->flags & NTF_EXT_LEARNED)) { + (n->flags & + (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } @@ -1090,9 +1164,15 @@ static void neigh_timer_handler(struct timer_list *t) if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - WRITE_ONCE(neigh->nud_state, NUD_FAILED); + if (neigh->nud_state == NUD_PROBE && + neigh->flags & NTF_EXT_VALIDATED) { + WRITE_ONCE(neigh->nud_state, NUD_STALE); + neigh->updated = jiffies; + } else { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + neigh_invalidate(neigh); + } notify = 1; - neigh_invalidate(neigh); goto out; } @@ -1240,6 +1320,8 @@ static void neigh_update_hhs(struct neighbour *neigh) NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. + NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed + or invalidated. Caller MUST hold reference count on the entry. */ @@ -1402,7 +1484,8 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, * we can reinject the packet there. */ n2 = NULL; - if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; @@ -1756,6 +1839,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, @@ -1972,21 +2056,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; - - if (ndm_flags & NTF_MANAGED) { + if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm_flags; - if (protocol) - pn->protocol = protocol; - err = 0; - } + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } @@ -2004,7 +2080,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || - ndm_flags & NTF_EXT_LEARNED; + ndm_flags & (NTF_EXT_LEARNED | + NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; @@ -2015,10 +2092,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED will result in the neighbor + * being created with an invalid state (NUD_NONE). + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = NUD_NONE; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot create externally validated neighbor with an invalid state"); + err = -EINVAL; + goto out; + } + } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & - (NTF_EXT_LEARNED | NTF_MANAGED), + (NTF_EXT_LEARNED | NTF_MANAGED | + NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); @@ -2030,6 +2124,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_release(neigh); goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED do not update the existing + * state other than clearing it if it was + * NUD_PERMANENT. + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot mark neighbor as externally validated with an invalid state"); + err = -EINVAL; + neigh_release(neigh); + goto out; + } + } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | @@ -2046,13 +2158,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; + if (ndm_flags & NTF_EXT_VALIDATED) + flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); - if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) neigh_event_send(neigh, NULL); - err = 0; - } neigh_release(neigh); out: return err; @@ -2579,13 +2691,15 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; + u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; - neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; - neigh_flags = pn->flags & NTF_OLD_MASK; + neigh_flags = READ_ONCE(pn->flags); + neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; + neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; @@ -2599,7 +2713,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + protocol = READ_ONCE(pn->protocol); + if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; @@ -2706,12 +2821,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - read_lock_bh(&tbl->lock); - for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; + n; + n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -2720,16 +2835,13 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); - if (err < 0) { - read_unlock_bh(&tbl->lock); + if (err < 0) goto out; - } next: idx++; } } - read_unlock_bh(&tbl->lock); out: cb->args[3] = h; cb->args[4] = idx; @@ -2846,64 +2958,58 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return err; } -static int neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, - void **dst, int *dev_idx, u8 *ndm_flags, - struct netlink_ext_ack *extack) +static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) { - struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); + } + + if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { + NL_SET_ERR_MSG(extack, "No device specified"); + return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) - return err; - - *ndm_flags = ndm->ndm_flags; - *dev_idx = ndm->ndm_ifindex; - *tbl = neigh_find_table(ndm->ndm_family); - if (*tbl == NULL) { - NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return -EAFNOSUPPORT; - } + return ERR_PTR(err); for (i = 0; i <= NDA_MAX; ++i) { - if (!tb[i]) - continue; - switch (i) { case NDA_DST: - if (nla_len(tb[i]) != (int)(*tbl)->key_len) { - NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return -EINVAL; + if (!tb[i]) { + NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); + return ERR_PTR(-EINVAL); } - *dst = nla_data(tb[i]); break; default: + if (!tb[i]) + continue; + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } } - return 0; + return ndm; } static inline size_t neigh_nlmsg_size(void) @@ -2917,27 +3023,6 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int neigh_get_reply(struct net *net, struct neighbour *neigh, - u32 pid, u32 seq) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -2946,85 +3031,91 @@ static inline size_t pneigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, - u32 pid, u32 seq, struct neigh_table *tbl) +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + struct net *net = sock_net(in_skb->sk); + u32 pid = NETLINK_CB(in_skb).portid; + struct nlattr *tb[NDA_MAX + 1]; + struct net_device *dev = NULL; + u32 seq = nlh->nlmsg_seq; + struct neigh_table *tbl; + struct neighbour *neigh; struct sk_buff *skb; - int err = 0; + struct ndmsg *ndm; + void *dst; + int err; - skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + ndm = neigh_valid_get_req(nlh, tb, extack); + if (IS_ERR(ndm)) + return PTR_ERR(ndm); + + if (ndm->ndm_flags & NTF_PROXY) + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + else + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; - err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); - if (err) { - kfree_skb(skb); - goto errout; - } + rcu_read_lock(); - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} + tbl = neigh_find_table(ndm->ndm_family); + if (!tbl) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + err = -EAFNOSUPPORT; + goto err_unlock; + } -static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(in_skb->sk); - struct net_device *dev = NULL; - struct neigh_table *tbl = NULL; - struct neighbour *neigh; - void *dst = NULL; - u8 ndm_flags = 0; - int dev_idx = 0; - int err; + if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + err = -EINVAL; + goto err_unlock; + } - err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, - extack); - if (err < 0) - return err; + dst = nla_data(tb[NDA_DST]); - if (dev_idx) { - dev = __dev_get_by_index(net, dev_idx); + if (ndm->ndm_ifindex) { + dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); - return -ENODEV; + err = -ENODEV; + goto err_unlock; } } - if (!dst) { - NL_SET_ERR_MSG(extack, "Network address not specified"); - return -EINVAL; - } - - if (ndm_flags & NTF_PROXY) { + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - pn = pneigh_lookup(tbl, net, dst, dev, 0); + pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); - return -ENOENT; + err = -ENOENT; + goto err_unlock; } - return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, tbl); - } - if (!dev) { - NL_SET_ERR_MSG(extack, "No device specified"); - return -EINVAL; - } + err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) + goto err_unlock; + } else { + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + err = -ENOENT; + goto err_unlock; + } - neigh = neigh_lookup(tbl, dst, dev); - if (!neigh) { - NL_SET_ERR_MSG(extack, "Neighbour entry not found"); - return -ENOENT; + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + neigh_release(neigh); + if (err) + goto err_unlock; } - err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq); - - neigh_release(neigh); + rcu_read_unlock(); + return rtnl_unicast(skb, net, pid); +err_unlock: + rcu_read_unlock(); + kfree_skb(skb); return err; } @@ -3231,9 +3322,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference(tbl->phash_buckets[bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3251,15 +3343,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + + pn = rcu_dereference(tbl->phash_buckets[state->bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3823,7 +3917,7 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, - .flags = RTNL_FLAG_DUMP_UNLOCKED}, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1ace0cd01adc..c28cd6665444 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -256,7 +256,7 @@ static ssize_t name_assign_type_show(struct device *dev, } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ +/* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -641,12 +641,6 @@ static ssize_t phys_port_id_show(struct device *dev, struct netdev_phys_item_id ppid; ssize_t ret; - /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the locking section below. - */ - if (!netdev->netdev_ops->ndo_get_phys_port_id) - return -EOPNOTSUPP; - ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; @@ -668,13 +662,6 @@ static ssize_t phys_port_name_show(struct device *dev, char name[IFNAMSIZ]; ssize_t ret; - /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the locking section below. - */ - if (!netdev->netdev_ops->ndo_get_phys_port_name && - !netdev->devlink_port) - return -EOPNOTSUPP; - ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; @@ -696,19 +683,11 @@ static ssize_t phys_switch_id_show(struct device *dev, struct netdev_phys_item_id ppid = { }; ssize_t ret; - /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the locking section below. This works - * because recurse is false when calling dev_get_port_parent_id. - */ - if (!netdev->netdev_ops->ndo_get_port_parent_id && - !netdev->devlink_port) - return -EOPNOTSUPP; - ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; - ret = dev_get_port_parent_id(netdev, &ppid, false); + ret = netif_get_port_parent_id(netdev, &ppid, false); if (!ret) ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); @@ -718,6 +697,40 @@ static ssize_t phys_switch_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_switch_id); +static struct attribute *netdev_phys_attrs[] __ro_after_init = { + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, + NULL, +}; + +static umode_t netdev_phys_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + struct device *dev = kobj_to_dev(kobj); + struct net_device *netdev = to_net_dev(dev); + + if (attr == &dev_attr_phys_port_id.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return 0; + } else if (attr == &dev_attr_phys_port_name.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->devlink_port) + return 0; + } else if (attr == &dev_attr_phys_switch_id.attr) { + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->devlink_port) + return 0; + } + + return attr->mode; +} + +static const struct attribute_group netdev_phys_group = { + .attrs = netdev_phys_attrs, + .is_visible = netdev_phys_is_visible, +}; + static ssize_t threaded_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -744,7 +757,7 @@ static int modify_napi_threaded(struct net_device *dev, unsigned long val) if (val != 0 && val != 1) return -EOPNOTSUPP; - ret = dev_set_threaded(dev, val); + ret = netif_set_threaded(dev, val); return ret; } @@ -783,9 +796,6 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_napi_defer_hard_irqs.attr, - &dev_attr_phys_port_id.attr, - &dev_attr_phys_port_name.attr, - &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, @@ -1200,12 +1210,21 @@ static int rx_queue_default_mask(struct net_device *dev, struct netdev_rx_queue *queue) { #if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) - struct cpumask *rps_default_mask = READ_ONCE(dev_net(dev)->core.rps_default_mask); + struct cpumask *rps_default_mask; + int res = 0; + mutex_lock(&rps_default_mask_mutex); + + rps_default_mask = dev_net(dev)->core.rps_default_mask; if (rps_default_mask && !cpumask_empty(rps_default_mask)) - return netdev_rx_queue_set_rps_mask(queue, rps_default_mask); -#endif + res = netdev_rx_queue_set_rps_mask(queue, rps_default_mask); + + mutex_unlock(&rps_default_mask_mutex); + + return res; +#else return 0; +#endif } static int rx_queue_add_kobject(struct net_device *dev, int index) @@ -2328,6 +2347,7 @@ int netdev_register_kobject(struct net_device *ndev) groups++; *groups++ = &netstat_group; + *groups++ = &netdev_phys_group; if (wireless_group_needed(ndev)) *groups++ = &wireless_group; diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 8a5b04c2699a..e938f25e8e86 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -11,4 +11,6 @@ int netdev_queue_update_kobjects(struct net_device *net, int netdev_change_owner(struct net_device *, const struct net *net_old, const struct net *net_new); +extern struct mutex rps_default_mask_mutex; + #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae54f26709ca..1b6f3826dd0e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -19,9 +19,9 @@ #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> -#include <linux/cookie.h> #include <linux/proc_fs.h> +#include <net/aligned_data.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> @@ -64,8 +64,6 @@ DECLARE_RWSEM(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; -DEFINE_COOKIE(net_cookie); - static struct net_generic *net_alloc_generic(void) { unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); @@ -319,10 +317,10 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return id; } @@ -332,12 +330,12 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); put_net(peer); if (id < 0) @@ -403,8 +401,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ { refcount_set(&net->passive, 1); refcount_set(&net->ns.count, 1); - ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); - ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt"); + ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); get_random_bytes(&net->hash_mix, sizeof(u32)); net->dev_base_seq = 1; @@ -434,9 +432,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - preempt_disable(); - net->net_cookie = gen_cookie_next(&net_cookie); - preempt_enable(); + net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -628,20 +624,20 @@ static void unhash_nsid(struct net *net, struct net *last) for_each_net(tmp) { int id; - spin_lock_bh(&tmp->nsid_lock); + spin_lock(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); + spin_unlock(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -791,16 +787,50 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +#ifdef CONFIG_NET_NS_REFCNT_TRACKER +static void net_ns_net_debugfs(struct net *net) +{ + ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt", + net->net_cookie, net->ns.inum); + ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt", + net->net_cookie, net->ns.inum); +} + +static int __init init_net_debugfs(void) +{ + ref_tracker_dir_debugfs(&init_net.refcnt_tracker); + ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker); + net_ns_net_debugfs(&init_net); + return 0; +} +late_initcall(init_net_debugfs); +#else +static void net_ns_net_debugfs(struct net *net) +{ +} +#endif + static __net_init int net_ns_net_init(struct net *net) { #ifdef CONFIG_NET_NS net->ns.ops = &netns_operations; #endif - return ns_alloc_inum(&net->ns); + net->ns.inum = PROC_NET_INIT_INO; + if (net != &init_net) { + int ret = ns_alloc_inum(&net->ns); + if (ret) + return ret; + } + net_ns_net_debugfs(net); + return 0; } static __net_exit void net_ns_net_exit(struct net *net) { + /* + * Initial network namespace doesn't exit so we don't need any + * special checks here. + */ ns_free_inum(&net->ns); } @@ -852,9 +882,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, @@ -863,7 +893,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, } err = alloc_netid(net, peer, nsid); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index d22f0919821e..dff66d8fb325 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -21,7 +21,9 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css_check(p, net_cls_cgrp_id, - rcu_read_lock_bh_held())); + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_trace_held())); } EXPORT_SYMBOL_GPL(task_cls_state); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 4fc44587f493..e9a2a6f26cb7 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,11 +92,12 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), }; /* NETDEV_CMD_BIND_TX - do */ @@ -193,7 +194,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + .maxattr = NETDEV_A_NAPI_THREADED, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2afa7b2141aa..6314eb7bdf69 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -184,6 +184,10 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; + if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED, + napi_get_threaded(napi))) + goto nla_put_failure; + if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) @@ -322,8 +326,18 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; + u8 threaded = 0; u32 defer = 0; + if (info->attrs[NETDEV_A_NAPI_THREADED]) { + int ret; + + threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); + ret = napi_set_threaded(napi, threaded); + if (ret) + return ret; + } + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); napi_set_defer_hard_irqs(napi, defer); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index d126f10197bf..3bf1151d8061 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -97,14 +97,12 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; - if (rxq_idx >= dev->real_num_rx_queues) - return -EINVAL; - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - if (rxq_idx >= dev->real_num_rx_queues) { NL_SET_ERR_MSG(extack, "rx queue index out of range"); return -ERANGE; } + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); return -EINVAL; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6ad84d4a2b46..a1da97b5b30b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -58,13 +58,6 @@ static void zap_completion_queue(void); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); -#define np_info(np, fmt, ...) \ - pr_info("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_err(np, fmt, ...) \ - pr_err("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_notice(np, fmt, ...) \ - pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) - static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) @@ -379,6 +372,31 @@ out: return ret; } +static void netpoll_udp_checksum(struct netpoll *np, struct sk_buff *skb, + int len) +{ + struct udphdr *udph; + int udp_len; + + udp_len = len + sizeof(struct udphdr); + udph = udp_hdr(skb); + + /* check needs to be set, since it will be consumed in csum_partial */ + udph->check = 0; + if (np->ipv6) + udph->check = csum_ipv6_magic(&np->local_ip.in6, + &np->remote_ip.in6, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + else + udph->check = csum_tcpudp_magic(np->local_ip.ip, + np->remote_ip.ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; +} + netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; @@ -396,24 +414,101 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } EXPORT_SYMBOL(netpoll_send_skb); +static void push_ipv6(struct netpoll *np, struct sk_buff *skb, int len) +{ + struct ipv6hdr *ip6h; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + + /* ip6h->version = 6; ip6h->priority = 0; */ + *(unsigned char *)ip6h = 0x60; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + + ip6h->payload_len = htons(sizeof(struct udphdr) + len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 32; + ip6h->saddr = np->local_ip.in6; + ip6h->daddr = np->remote_ip.in6; + + skb->protocol = htons(ETH_P_IPV6); +} + +static void push_ipv4(struct netpoll *np, struct sk_buff *skb, int len) +{ + static atomic_t ip_ident; + struct iphdr *iph; + int ip_len; + + ip_len = len + sizeof(struct udphdr) + sizeof(struct iphdr); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + *(unsigned char *)iph = 0x45; + iph->tos = 0; + put_unaligned(htons(ip_len), &iph->tot_len); + iph->id = htons(atomic_inc_return(&ip_ident)); + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip.ip, &iph->saddr); + put_unaligned(np->remote_ip.ip, &iph->daddr); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->protocol = htons(ETH_P_IP); +} + +static void push_udp(struct netpoll *np, struct sk_buff *skb, int len) +{ + struct udphdr *udph; + int udp_len; + + udp_len = len + sizeof(struct udphdr); + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + udph = udp_hdr(skb); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); + + netpoll_udp_checksum(np, skb, len); +} + +static void push_eth(struct netpoll *np, struct sk_buff *skb) +{ + struct ethhdr *eth; + + eth = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + ether_addr_copy(eth->h_source, np->dev->dev_addr); + ether_addr_copy(eth->h_dest, np->remote_mac); + if (np->ipv6) + eth->h_proto = htons(ETH_P_IPV6); + else + eth->h_proto = htons(ETH_P_IP); +} + int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; - struct udphdr *udph; - struct iphdr *iph; - struct ethhdr *eth; - static atomic_t ip_ident; - struct ipv6hdr *ip6h; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!irqs_disabled()); - udp_len = len + sizeof(*udph); + udp_len = len + sizeof(struct udphdr); if (np->ipv6) - ip_len = udp_len + sizeof(*ip6h); + ip_len = udp_len + sizeof(struct ipv6hdr); else - ip_len = udp_len + sizeof(*iph); + ip_len = udp_len + sizeof(struct iphdr); total_len = ip_len + LL_RESERVED_SPACE(np->dev); @@ -425,117 +520,18 @@ int netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); - skb_push(skb, sizeof(*udph)); - skb_reset_transport_header(skb); - udph = udp_hdr(skb); - udph->source = htons(np->local_port); - udph->dest = htons(np->remote_port); - udph->len = htons(udp_len); - - udph->check = 0; - if (np->ipv6) { - udph->check = csum_ipv6_magic(&np->local_ip.in6, - &np->remote_ip.in6, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*ip6h)); - skb_reset_network_header(skb); - ip6h = ipv6_hdr(skb); - - /* ip6h->version = 6; ip6h->priority = 0; */ - *(unsigned char *)ip6h = 0x60; - ip6h->flow_lbl[0] = 0; - ip6h->flow_lbl[1] = 0; - ip6h->flow_lbl[2] = 0; - - ip6h->payload_len = htons(sizeof(struct udphdr) + len); - ip6h->nexthdr = IPPROTO_UDP; - ip6h->hop_limit = 32; - ip6h->saddr = np->local_ip.in6; - ip6h->daddr = np->remote_ip.in6; - - eth = skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IPV6); - } else { - udph->check = csum_tcpudp_magic(np->local_ip.ip, - np->remote_ip.ip, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - - /* iph->version = 4; iph->ihl = 5; */ - *(unsigned char *)iph = 0x45; - iph->tos = 0; - put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = htons(atomic_inc_return(&ip_ident)); - iph->frag_off = 0; - iph->ttl = 64; - iph->protocol = IPPROTO_UDP; - iph->check = 0; - put_unaligned(np->local_ip.ip, &(iph->saddr)); - put_unaligned(np->remote_ip.ip, &(iph->daddr)); - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - eth = skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IP); - } - - ether_addr_copy(eth->h_source, np->dev->dev_addr); - ether_addr_copy(eth->h_dest, np->remote_mac); - + push_udp(np, skb, len); + if (np->ipv6) + push_ipv6(np, skb, len); + else + push_ipv4(np, skb, len); + push_eth(np, skb); skb->dev = np->dev; return (int)netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); -void netpoll_print_options(struct netpoll *np) -{ - np_info(np, "local port %d\n", np->local_port); - if (np->ipv6) - np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); - else - np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); - np_info(np, "interface name '%s'\n", np->dev_name); - np_info(np, "local ethernet address '%pM'\n", np->dev_mac); - np_info(np, "remote port %d\n", np->remote_port); - if (np->ipv6) - np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); - else - np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); - np_info(np, "remote ethernet address %pM\n", np->remote_mac); -} -EXPORT_SYMBOL(netpoll_print_options); - -static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) -{ - const char *end; - - if (!strchr(str, ':') && - in4_pton(str, -1, (void *)addr, -1, &end) > 0) { - if (!*end) - return 0; - } - if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { -#if IS_ENABLED(CONFIG_IPV6) - if (!*end) - return 1; -#else - return -1; -#endif - } - return -1; -} static void skb_pool_flush(struct netpoll *np) { @@ -546,95 +542,6 @@ static void skb_pool_flush(struct netpoll *np) skb_queue_purge_reason(skb_pool, SKB_CONSUMED); } -int netpoll_parse_options(struct netpoll *np, char *opt) -{ - char *cur=opt, *delim; - int ipv6; - bool ipversion_set = false; - - if (*cur != '@') { - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (kstrtou16(cur, 10, &np->local_port)) - goto parse_failed; - cur = delim; - } - cur++; - - if (*cur != '/') { - ipversion_set = true; - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); - if (ipv6 < 0) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim; - } - cur++; - - if (*cur != ',') { - /* parse out dev_name or dev_mac */ - if ((delim = strchr(cur, ',')) == NULL) - goto parse_failed; - *delim = 0; - - np->dev_name[0] = '\0'; - eth_broadcast_addr(np->dev_mac); - if (!strchr(cur, ':')) - strscpy(np->dev_name, cur, sizeof(np->dev_name)); - else if (!mac_pton(cur, np->dev_mac)) - goto parse_failed; - - cur = delim; - } - cur++; - - if (*cur != '@') { - /* dst port */ - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (*cur == ' ' || *cur == '\t') - np_info(np, "warning: whitespace is not allowed\n"); - if (kstrtou16(cur, 10, &np->remote_port)) - goto parse_failed; - cur = delim; - } - cur++; - - /* dst ip */ - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); - if (ipv6 < 0) - goto parse_failed; - else if (ipversion_set && np->ipv6 != (bool)ipv6) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim + 1; - - if (*cur != 0) { - /* MAC address */ - if (!mac_pton(cur, np->remote_mac)) - goto parse_failed; - } - - netpoll_print_options(np); - - return 0; - - parse_failed: - np_info(np, "couldn't parse config at '%s'!\n", cur); - return -1; -} -EXPORT_SYMBOL(netpoll_parse_options); - static void refill_skbs_work_handler(struct work_struct *work) { struct netpoll *np = @@ -716,13 +623,97 @@ static char *egress_dev(struct netpoll *np, char *buf) return buf; } +static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev, + unsigned int timeout) +{ + unsigned long atmost; + + atmost = jiffies + timeout * HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + np_notice(np, "timeout waiting for carrier\n"); + break; + } + msleep(1); + } +} + +/* + * Take the IPv6 from ndev and populate local_ip structure in netpoll + */ +static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + int err = -EDESTADDRREQ; + struct inet6_dev *idev; + + if (!IS_ENABLED(CONFIG_IPV6)) { + np_err(np, "IPv6 is not supported %s, aborting\n", + egress_dev(np, buf)); + return -EINVAL; + } + + idev = __in6_dev_get(ndev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) + continue; + /* Got the IP, let's return */ + np->local_ip.in6 = ifp->addr; + err = 0; + break; + } + read_unlock_bh(&idev->lock); + } + if (err) { + np_err(np, "no IPv6 address for %s, aborting\n", + egress_dev(np, buf)); + return err; + } + + np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); + return 0; +} + +/* + * Take the IPv4 from ndev and populate local_ip structure in netpoll + */ +static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + np->local_ip.ip = ifa->ifa_local; + np_info(np, "local IP %pI4\n", &np->local_ip.ip); + + return 0; +} + int netpoll_setup(struct netpoll *np) { struct net *net = current->nsproxy->net_ns; char buf[MAC_ADDR_STR_LEN + 1]; struct net_device *ndev = NULL; bool ip_overwritten = false; - struct in_device *in_dev; int err; rtnl_lock(); @@ -746,85 +737,31 @@ int netpoll_setup(struct netpoll *np) } if (!netif_running(ndev)) { - unsigned long atmost; - np_info(np, "device %s not up yet, forcing it\n", egress_dev(np, buf)); err = dev_open(ndev, NULL); - if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); - atmost = jiffies + carrier_timeout * HZ; - while (!netif_carrier_ok(ndev)) { - if (time_after(jiffies, atmost)) { - np_notice(np, "timeout waiting for carrier\n"); - break; - } - msleep(1); - } - + netpoll_wait_carrier(np, ndev, carrier_timeout); rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { - const struct in_ifaddr *ifa; - - in_dev = __in_dev_get_rtnl(ndev); - if (!in_dev) - goto put_noaddr; - - ifa = rtnl_dereference(in_dev->ifa_list); - if (!ifa) { -put_noaddr: - np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); - err = -EDESTADDRREQ; + err = netpoll_take_ipv4(np, ndev); + if (err) goto put; - } - - np->local_ip.ip = ifa->ifa_local; - ip_overwritten = true; - np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *idev; - - err = -EDESTADDRREQ; - idev = __in6_dev_get(ndev); - if (idev) { - struct inet6_ifaddr *ifp; - - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != - !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) - continue; - np->local_ip.in6 = ifp->addr; - ip_overwritten = true; - err = 0; - break; - } - read_unlock_bh(&idev->lock); - } - if (err) { - np_err(np, "no IPv6 address for %s, aborting\n", - egress_dev(np, buf)); + err = netpoll_take_ipv6(np, ndev); + if (err) goto put; - } else - np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); -#else - np_err(np, "IPv6 is not supported %s, aborting\n", - egress_dev(np, buf)); - err = -EINVAL; - goto put; -#endif } + ip_overwritten = true; } err = __netpoll_setup(np, ndev); @@ -863,7 +800,7 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) kfree(npinfo); } -void __netpoll_cleanup(struct netpoll *np) +static void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -885,7 +822,6 @@ void __netpoll_cleanup(struct netpoll *np) skb_pool_flush(np); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); void __netpoll_free(struct netpoll *np) { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ba7cf3e3c32f..05e2e22a8f7c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -371,7 +371,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem); static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { @@ -409,7 +409,7 @@ static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); alloc_stat_inc(pool, waive); netmem = 0; break; @@ -544,8 +544,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, } /* slow path */ -static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t gfp) +static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool, + gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; unsigned int pp_order = pool->p.order; @@ -615,7 +615,7 @@ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) netmem = pool->mp_ops->alloc_netmems(pool, gfp); else - netmem = __page_pool_alloc_pages_slow(pool, gfp); + netmem = __page_pool_alloc_netmems_slow(pool, gfp); return netmem; } EXPORT_SYMBOL(page_pool_alloc_netmems); @@ -673,8 +673,8 @@ void page_pool_clear_pp_info(netmem_ref netmem) netmem_set_pp(netmem, NULL); } -static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, - netmem_ref netmem) +static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, + netmem_ref netmem) { struct page *old, *page = netmem_to_page(netmem); unsigned long id; @@ -712,7 +712,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem) { int count; bool put; @@ -721,7 +721,7 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) put = pool->mp_ops->release_netmem(pool, netmem); else - __page_pool_release_page_dma(pool, netmem); + __page_pool_release_netmem_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. @@ -826,7 +826,7 @@ __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); return 0; } @@ -869,7 +869,7 @@ void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_put_unrefed_netmem); @@ -912,7 +912,7 @@ static void page_pool_recycle_ring_bulk(struct page_pool *pool, * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) - page_pool_return_page(pool, bulk[i]); + page_pool_return_netmem(pool, bulk[i]); } /** @@ -995,7 +995,7 @@ static netmem_ref page_pool_drain_frag(struct page_pool *pool, return netmem; } - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); return 0; } @@ -1009,7 +1009,7 @@ static void page_pool_free_frag(struct page_pool *pool) if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, @@ -1076,7 +1076,7 @@ static void page_pool_empty_ring(struct page_pool *pool) pr_crit("%s() page_pool refcnt %d violation\n", __func__, netmem_ref_count(netmem)); - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } @@ -1109,7 +1109,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } @@ -1136,7 +1136,7 @@ static void page_pool_scrub(struct page_pool *pool) } xa_for_each(&pool->dma_mapped, id, ptr) - __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr)); } /* No more consumers should exist, but producers could still @@ -1253,7 +1253,7 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c57692eb8da9..094b085cff20 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1026,9 +1026,11 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_error = error, .rta_id = id, }; + unsigned long delta; if (dst) { - ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + delta = jiffies - READ_ONCE(dst->lastuse); + ci.rta_lastuse = jiffies_delta_to_clock_t(delta); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } @@ -1446,7 +1448,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) struct netdev_phys_item_id ppid = { }; int err; - err = dev_get_port_parent_id(dev, &ppid, false); + err = netif_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -2036,7 +2038,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm->__ifi_pad = 0; ifm->ifi_type = READ_ONCE(dev->type); ifm->ifi_index = READ_ONCE(dev->ifindex); - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) @@ -5225,7 +5227,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = 0; diff --git a/net/core/scm.c b/net/core/scm.c index 0225bd94170f..072d5742440a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -23,6 +23,8 @@ #include <linux/security.h> #include <linux/pid_namespace.h> #include <linux/pid.h> +#include <uapi/linux/pidfd.h> +#include <linux/pidfs.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/errqueue.h> @@ -145,6 +147,22 @@ void __scm_destroy(struct scm_cookie *scm) } EXPORT_SYMBOL(__scm_destroy); +static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid) +{ + int err; + + /* drop all previous references */ + scm_destroy_cred(scm); + + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + scm->pid = pid; + scm->creds.pid = pid_vnr(pid); + return 0; +} + int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { const struct proto_ops *ops = READ_ONCE(sock->ops); @@ -189,15 +207,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - p->creds.pid = creds.pid; if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(creds.pid); if (!pid) goto error; - put_pid(p->pid); - p->pid = pid; + + /* pass a struct pid reference from + * find_get_pid() to scm_replace_pid(). + */ + err = scm_replace_pid(p, pid); + if (err) { + put_pid(pid); + goto error; + } } err = -EINVAL; @@ -459,7 +483,7 @@ static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) if (!scm->pid) return; - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file); if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { if (pidfd_file) { diff --git a/net/core/selftests.c b/net/core/selftests.c index 406faf8e5f3f..3d79133a91a6 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -27,6 +27,7 @@ struct net_packet_attrs { int max_size; u8 id; u16 queue_mapping; + bool bad_csum; }; struct net_test_priv { @@ -165,6 +166,20 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); + + if (attr->bad_csum) { + /* Force mangled checksum */ + if (skb_checksum_help(skb)) { + kfree_skb(skb); + return NULL; + } + + if (thdr->check != CSUM_MANGLED_0) + thdr->check = CSUM_MANGLED_0; + else + thdr->check = csum16_sub(thdr->check, + cpu_to_be16(1)); + } } else { udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); } @@ -239,7 +254,11 @@ static int net_test_loopback_validate(struct sk_buff *skb, if (tpriv->packet->id != shdr->id) goto out; - tpriv->ok = true; + if (tpriv->packet->bad_csum && skb->ip_summed == CHECKSUM_UNNECESSARY) + tpriv->ok = -EIO; + else + tpriv->ok = true; + complete(&tpriv->comp); out: kfree_skb(skb); @@ -285,7 +304,12 @@ static int __net_test_loopback(struct net_device *ndev, attr->timeout = NET_LB_TIMEOUT; wait_for_completion_timeout(&tpriv->comp, attr->timeout); - ret = tpriv->ok ? 0 : -ETIMEDOUT; + if (tpriv->ok < 0) + ret = tpriv->ok; + else if (!tpriv->ok) + ret = -ETIMEDOUT; + else + ret = 0; cleanup: dev_remove_pack(&tpriv->pt); @@ -345,6 +369,42 @@ static int net_test_phy_loopback_tcp(struct net_device *ndev) return __net_test_loopback(ndev, &attr); } +/** + * net_test_phy_loopback_tcp_bad_csum - PHY loopback test with a deliberately + * corrupted TCP checksum + * @ndev: the network device to test + * + * Builds the same minimal Ethernet/IPv4/TCP frame as + * net_test_phy_loopback_tcp(), then flips the least-significant bit of the TCP + * checksum so the resulting value is provably invalid (neither 0 nor 0xFFFF). + * The frame is transmitted through the device’s internal PHY loopback path: + * + * test code -> MAC driver -> MAC HW -> xMII -> PHY -> + * internal PHY loopback -> xMII -> MAC HW -> MAC driver -> test code + * + * Result interpretation + * --------------------- + * 0 The frame is delivered to the stack and the driver reports + * ip_summed as CHECKSUM_NONE or CHECKSUM_COMPLETE - both are + * valid ways to indicate “bad checksum, let the stack verify.” + * -ETIMEDOUT The MAC/PHY silently dropped the frame; hardware checksum + * verification filtered it out before the driver saw it. + * -EIO The driver returned the frame with ip_summed == + * CHECKSUM_UNNECESSARY, falsely claiming a valid checksum and + * indicating a serious RX-path defect. + * + * Return: 0 on success or a negative error code on failure. + */ +static int net_test_phy_loopback_tcp_bad_csum(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + attr.bad_csum = true; + return __net_test_loopback(ndev, &attr); +} + static const struct net_test { char name[ETH_GSTRING_LEN]; int (*fn)(struct net_device *ndev); @@ -369,6 +429,9 @@ static const struct net_test { .name = "PHY internal loopback, TCP ", .fn = net_test_phy_loopback_tcp, }, { + .name = "PHY loopback, bad TCP csum ", + .fn = net_test_phy_loopback_tcp_bad_csum, + }, { /* This test should be done after all PHY loopback test */ .name = "PHY internal loopback, disable", .fn = net_test_phy_loopback_disable, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d6420b74ea9c..ee0274417948 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -384,8 +384,7 @@ static inline void __finalize_skb_around(struct sk_buff *skb, void *data, skb_set_kcov_handle(skb, kcov_common_handle()); } -static inline void *__slab_build_skb(struct sk_buff *skb, void *data, - unsigned int *size) +static inline void *__slab_build_skb(void *data, unsigned int *size) { void *resized; @@ -418,7 +417,7 @@ struct sk_buff *slab_build_skb(void *data) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); - data = __slab_build_skb(skb, data, &size); + data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); return skb; @@ -435,7 +434,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, * using slab buffer should use slab_build_skb() instead. */ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) - data = __slab_build_skb(skb, data, &size); + data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); } @@ -3060,10 +3059,8 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd, /* * Fill page/offset/length into spd, if it can hold more pages. */ -static bool spd_fill_page(struct splice_pipe_desc *spd, - struct pipe_inode_info *pipe, struct page *page, - unsigned int *len, unsigned int offset, - bool linear, +static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page, + unsigned int *len, unsigned int offset, bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) @@ -3091,8 +3088,7 @@ static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct splice_pipe_desc *spd, bool linear, - struct sock *sk, - struct pipe_inode_info *pipe) + struct sock *sk) { if (!*len) return true; @@ -3111,8 +3107,7 @@ static bool __splice_segment(struct page *page, unsigned int poff, do { unsigned int flen = min(*len, plen); - if (spd_fill_page(spd, pipe, page, &flen, poff, - linear, sk)) + if (spd_fill_page(spd, page, &flen, poff, linear, sk)) return true; poff += flen; plen -= flen; @@ -3130,8 +3125,8 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, struct sock *sk) { - int seg; struct sk_buff *iter; + int seg; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -3143,7 +3138,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, skb_headlen(skb), offset, len, spd, skb_head_is_locked(skb), - sk, pipe)) + sk)) return true; /* @@ -3160,7 +3155,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), - offset, len, spd, false, sk, pipe)) + offset, len, spd, false, sk)) return true; } @@ -3235,6 +3230,7 @@ typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len, sendmsg_func sendmsg, int flags) { + int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0; unsigned int orig_len = len; struct sk_buff *head = skb; unsigned short fragidx; @@ -3252,6 +3248,8 @@ do_frag_list: kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT | flags; + if (slen < len) + msg.msg_flags |= more_hint; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, @@ -3292,6 +3290,8 @@ do_frag_list: flags, }; + if (slen < len) + msg.msg_flags |= more_hint; bvec_set_page(&bvec, skb_frag_page(frag), slen, skb_frag_off(frag) + offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, @@ -6763,8 +6763,7 @@ static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); /* carve out the first eat bytes from skb's frag_list. May recurse into * pskb_carve() */ -static int pskb_carve_frag_list(struct sk_buff *skb, - struct skb_shared_info *shinfo, int eat, +static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat, gfp_t gfp_mask) { struct sk_buff *list = shinfo->frag_list; @@ -6869,7 +6868,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_clone_fraglist(skb); /* split line is in frag list */ - if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { + if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) { /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); @@ -7234,7 +7233,6 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, * @skb: The buffer to add pages to * @iter: Iterator representing the pages to be added * @maxsize: Maximum amount of pages to be added - * @gfp: Allocation flags * * This is a common helper function for supporting MSG_SPLICE_PAGES. It * extracts pages from an iterator and adds them to the socket buffer if @@ -7245,7 +7243,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, * insufficient space in the buffer to transfer anything. */ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, - ssize_t maxsize, gfp_t gfp) + ssize_t maxsize) { size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 34c51eb1a14f..83c78379932e 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -656,6 +656,13 @@ static void sk_psock_backlog(struct work_struct *work) bool ingress; int ret; + /* If sk is quickly removed from the map and then added back, the old + * psock should not be scheduled, because there are now two psocks + * pointing to the same sk. + */ + if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + return; + /* Increment the psock refcnt to synchronize with close(fd) path in * sock_map_close(), ensuring we wait for backlog thread completion * before sk_socket freed. If refcnt increment fails, it indicates diff --git a/net/core/sock.c b/net/core/sock.c index 3b409bc8ef6d..7c26ec8dce63 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -526,11 +526,10 @@ int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason drop_reason; int err; - err = sk_filter(sk, skb); - if (err) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + err = sk_filter_reason(sk, skb, &drop_reason); + if (err) goto out; - } + err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: @@ -553,15 +552,18 @@ EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; int rc = NET_RX_SUCCESS; + int err; - if (sk_filter_trim_cap(sk, skb, trim_cap)) + if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); + reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } if (nested) @@ -577,8 +579,12 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); - } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { + } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { bh_unlock_sock(sk); + if (err == -ENOMEM) + reason = SKB_DROP_REASON_PFMEMALLOC; + if (err == -ENOBUFS) + reason = SKB_DROP_REASON_SOCKET_BACKLOG; atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -589,7 +595,7 @@ out: sock_put(sk); return rc; discard_and_relse: - kfree_skb(skb); + sk_skb_reason_drop(sk, skb, reason); goto out; } EXPORT_SYMBOL(__sk_receive_skb); @@ -602,7 +608,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); @@ -620,7 +626,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); @@ -818,12 +824,10 @@ EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { - lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); - release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); @@ -837,14 +841,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) } } -void sock_enable_timestamps(struct sock *sk) -{ - lock_sock(sk); - __sock_set_timestamps(sk, true, false, true); - release_sock(sk); -} -EXPORT_SYMBOL(sock_enable_timestamps); - void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { @@ -1295,6 +1291,14 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_DEVMEM_DONTNEED: return sock_devmem_dontneed(sk, optval, optlen); #endif + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + return sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + return sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); } sockopt_lock_sock(sk); @@ -1450,18 +1454,6 @@ set_sndbuf: WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; } - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, - optlen, optname == SO_RCVTIMEO_OLD); - break; - - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, - optlen, optname == SO_SNDTIMEO_OLD); - break; - case SO_ATTACH_FILTER: { struct sock_fprog fprog; @@ -2602,8 +2594,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : - READ_ONCE(dst->dev->gso_ipv4_max_size); + max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : + READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2614,7 +2606,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - sk->sk_route_caps = dst->dev->features; + sk->sk_route_caps = dst_dev(dst)->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2632,7 +2624,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; @@ -2788,17 +2780,6 @@ void sock_pfree(struct sk_buff *skb) EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ -kuid_t sock_i_uid(struct sock *sk) -{ - kuid_t uid; - - read_lock_bh(&sk->sk_callback_lock); - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; - read_unlock_bh(&sk->sk_callback_lock); - return uid; -} -EXPORT_SYMBOL(sock_i_uid); - unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 82a14f131d00..5947b38e4f8b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1709,7 +1709,6 @@ EXPORT_SYMBOL_GPL(sock_map_close); struct sockmap_link { struct bpf_link link; struct bpf_map *map; - enum bpf_attach_type attach_type; }; static void sock_map_link_release(struct bpf_link *link) @@ -1721,7 +1720,7 @@ static void sock_map_link_release(struct bpf_link *link) goto out; WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, - sockmap_link->attach_type)); + link->attach_type)); bpf_map_put_with_uref(sockmap_link->map); sockmap_link->map = NULL; @@ -1772,7 +1771,7 @@ static int sock_map_link_update_prog(struct bpf_link *link, } ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, - sockmap_link->attach_type); + link->attach_type); if (ret) goto out; @@ -1817,7 +1816,7 @@ static int sock_map_link_fill_info(const struct bpf_link *link, u32 map_id = sock_map_link_get_map_id(sockmap_link); info->sockmap.map_id = map_id; - info->sockmap.attach_type = sockmap_link->attach_type; + info->sockmap.attach_type = link->attach_type; return 0; } @@ -1828,7 +1827,7 @@ static void sock_map_link_show_fdinfo(const struct bpf_link *link, u32 map_id = sock_map_link_get_map_id(sockmap_link); seq_printf(seq, "map_id:\t%u\n", map_id); - seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); + seq_printf(seq, "attach_type:\t%u\n", link->attach_type); } static const struct bpf_link_ops sock_map_link_ops = { @@ -1866,9 +1865,9 @@ int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) } attach_type = attr->link_create.attach_type; - bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, + attach_type); sockmap_link->map = map; - sockmap_link->attach_type = attach_type; ret = bpf_link_prime(&sockmap_link->link, &link_primer); if (ret) { diff --git a/net/core/stream.c b/net/core/stream.c index b16dfa568a2d..7a37e7dd2c43 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -23,9 +23,13 @@ /** * sk_stream_write_space - stream socket write_space callback. - * @sk: socket + * @sk: pointer to the socket structure * - * FIXME: write proper description + * This function is invoked when there's space available in the socket's + * send buffer for writing. It first checks if the socket is writable, + * clears the SOCK_NOSPACE flag indicating that memory for writing + * is now available, wakes up any processes waiting for write operations + * and sends asynchronous notifications if needed. */ void sk_stream_write_space(struct sock *sk) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5dbb2c6f371d..8cf04b57ade1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -28,6 +28,7 @@ #include <net/rps.h> #include "dev.h" +#include "net-sysfs.h" static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; @@ -96,50 +97,40 @@ free_buf: #ifdef CONFIG_RPS -static struct cpumask *rps_default_mask_cow_alloc(struct net *net) -{ - struct cpumask *rps_default_mask; - - if (net->core.rps_default_mask) - return net->core.rps_default_mask; - - rps_default_mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!rps_default_mask) - return NULL; - - /* pairs with READ_ONCE in rx_queue_default_mask() */ - WRITE_ONCE(net->core.rps_default_mask, rps_default_mask); - return rps_default_mask; -} +DEFINE_MUTEX(rps_default_mask_mutex); static int rps_default_mask_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->data; + struct cpumask *mask; int err = 0; - rtnl_lock(); + mutex_lock(&rps_default_mask_mutex); + mask = net->core.rps_default_mask; if (write) { - struct cpumask *rps_default_mask = rps_default_mask_cow_alloc(net); - + if (!mask) { + mask = kzalloc(cpumask_size(), GFP_KERNEL); + net->core.rps_default_mask = mask; + } err = -ENOMEM; - if (!rps_default_mask) + if (!mask) goto done; - err = cpumask_parse(buffer, rps_default_mask); + err = cpumask_parse(buffer, mask); if (err) goto done; - err = rps_cpumask_housekeeping(rps_default_mask); + err = rps_cpumask_housekeeping(mask); if (err) goto done; } else { err = dump_cpumask(buffer, lenp, ppos, - net->core.rps_default_mask ? : cpu_none_mask); + mask ?: cpu_none_mask); } done: - rtnl_unlock(); + mutex_unlock(&rps_default_mask_mutex); return err; } |