diff options
Diffstat (limited to 'net')
144 files changed, 3776 insertions, 1577 deletions
diff --git a/net/Kconfig b/net/Kconfig index f0a8692496ff..d27d0deac0bf 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -290,15 +290,21 @@ config MAX_SKB_FRAGS If unsure, say 17. config RPS - bool + bool "Receive packet steering" depends on SMP && SYSFS default y + help + Software receive side packet steering (RPS) distributes the + load of received packet processing across multiple CPUs. config RFS_ACCEL - bool + bool "Hardware acceleration of RFS" depends on RPS select CPU_RMAP default y + help + Allowing drivers for multiqueue hardware with flow filter tables to + accelerate RFS. config SOCK_RX_QUEUE_MAPPING bool @@ -351,7 +357,7 @@ config BPF_STREAM_PARSER BPF_MAP_TYPE_SOCKMAP. config NET_FLOW_LIMIT - bool + bool "Net flow limit" depends on RPS default y help @@ -502,6 +508,7 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" + select DIMLIB default y help An alternative userspace interface for ethtool based on generic diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index f81f8d56f5c0..0f7a39aeccc8 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -68,7 +68,7 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, goto done; } error = put_user(sk->sk_sndbuf - sk_wmem_alloc_get(sk), - (int __user *)argp) ? -EFAULT : 0; + (int __user *)argp); goto done; case SIOCINQ: { @@ -83,7 +83,7 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, skb = skb_peek(&sk->sk_receive_queue); amount = skb ? skb->len : 0; spin_unlock_irq(&sk->sk_receive_queue.lock); - error = put_user(amount, (int __user *)argp) ? -EFAULT : 0; + error = put_user(amount, (int __user *)argp); goto done; } case ATM_SETSC: diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 891cdf61c65a..3ea52b05adfb 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -272,12 +272,12 @@ static int bpf_dummy_init_member(const struct btf_type *t, return -EOPNOTSUPP; } -static int bpf_dummy_reg(void *kdata) +static int bpf_dummy_reg(void *kdata, struct bpf_link *link) { return -EOPNOTSUPP; } -static void bpf_dummy_unreg(void *kdata) +static void bpf_dummy_unreg(void *kdata, struct bpf_link *link) { } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 36ae54f57bf5..a6d7f790cdda 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -283,9 +283,10 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes, static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, u32 repeat) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int err = 0, act, ret, i, nframes = 0, batch_sz; struct xdp_frame **frames = xdp->frames; + struct bpf_redirect_info *ri; struct xdp_page_head *head; struct xdp_frame *frm; bool redirect = false; @@ -295,6 +296,8 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, batch_sz = min_t(u32, repeat, xdp->batch_size); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + ri = bpf_net_ctx_get_ri(); xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { @@ -359,6 +362,7 @@ out: } xdp_clear_return_frame_no_direct(); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return err; } @@ -394,6 +398,7 @@ static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; @@ -419,10 +424,14 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, do { run_ctx.prog_item = &item; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); + + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index bf30c50b5689..3c9f6538990e 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -137,6 +137,7 @@ static inline bool is_pppoe_ipv6(const struct sk_buff *skb, #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) struct brnf_frag_data { + local_lock_t bh_lock; char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; u8 encap_size; u8 size; @@ -144,7 +145,9 @@ struct brnf_frag_data { __be16 vlan_proto; }; -static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); +static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static void nf_bridge_info_free(struct sk_buff *skb) { @@ -850,6 +853,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; + int ret; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; @@ -882,6 +886,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { @@ -897,7 +902,9 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); + ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); + return ret; } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { @@ -909,6 +916,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; @@ -916,8 +924,12 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - if (v6ops) - return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); + if (v6ops) { + ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); + return ret; + } + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index 17abf092f7ca..71a12da30004 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -315,8 +315,8 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, if (curr_change) *changed = curr_change; - __vlan_tunnel_handle_range(p, &v_start, &v_end, v, - curr_change); + __vlan_tunnel_handle_range(p, &v_start, &v_end, v, + curr_change); } if (v_start && v_end) br_vlan_notify(br, p, v_start->vid, v_end->vid, diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index c3c51b9a6826..816bb0fde718 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -32,7 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; unsigned int hlen, ll_rs, mtu; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; @@ -82,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -113,7 +113,7 @@ slow_path: goto blackhole; } - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index 7796414d47e5..2ae8cfa3df88 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -21,13 +21,6 @@ do { \ pr_warn(errmsg); \ } while (0) -struct cfpktq { - struct sk_buff_head head; - atomic_t count; - /* Lock protects count updates */ - spinlock_t lock; -}; - /* * net/caif/ is generic and does not * understand SKB, so we do this typecast diff --git a/net/can/Kconfig b/net/can/Kconfig index cb56be8e3862..af64a6f76458 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -56,18 +56,17 @@ config CAN_GW source "net/can/j1939/Kconfig" config CAN_ISOTP - tristate "ISO 15765-2:2016 CAN transport protocol" + tristate "ISO 15765-2 CAN transport protocol" help CAN Transport Protocols offer support for segmented Point-to-Point communication between CAN nodes via two defined CAN Identifiers. + This protocol driver implements segmented data transfers for CAN CC + (aka Classical CAN, CAN 2.0B) and CAN FD frame types which were + introduced with ISO 15765-2:2016. As CAN frames can only transport a small amount of data bytes - (max. 8 bytes for 'classic' CAN and max. 64 bytes for CAN FD) this + (max. 8 bytes for CAN CC and max. 64 bytes for CAN FD) this segmentation is needed to transport longer Protocol Data Units (PDU) as needed e.g. for vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. - This protocol driver implements data transfers according to - ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types. - If you want to perform automotive vehicle diagnostic services (UDS), - say 'y'. endif diff --git a/net/can/isotp.c b/net/can/isotp.c index 25bac0fafc83..16046931542a 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -72,7 +72,7 @@ #include <net/sock.h> #include <net/net_namespace.h> -MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); +MODULE_DESCRIPTION("PF_CAN ISO 15765-2 transport protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS("can-proto-6"); @@ -83,10 +83,11 @@ MODULE_ALIAS("can-proto-6"); (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) -/* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can - * take full 32 bit values (4 Gbyte). We would need some good concept to handle - * this between user space and kernel space. For now set the static buffer to - * something about 8 kbyte to be able to test this new functionality. +/* Since ISO 15765-2:2016 the CAN isotp protocol supports more than 4095 + * byte per ISO PDU as the FF_DL can take full 32 bit values (4 Gbyte). + * We would need some good concept to handle this between user space and + * kernel space. For now set the static buffer to something about 8 kbyte + * to be able to test this new functionality. */ #define DEFAULT_MAX_PDU_SIZE 8300 diff --git a/net/core/datagram.c b/net/core/datagram.c index e614cfd8e14a..95f242591fd2 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -417,14 +417,14 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); + u8 *vaddr = kmap_local_page(page); if (copy > len) copy = len; n = INDIRECT_CALL_1(cb, simple_copy_to_iter, vaddr + skb_frag_off(frag) + offset - start, copy, data, to); - kunmap(page); + kunmap_local(vaddr); offset += n; if (n != copy) goto short_copy; diff --git a/net/core/dev.c b/net/core/dev.c index 2b4819b610b8..0a23d7da7fbc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -229,7 +229,7 @@ static inline void backlog_lock_irq_save(struct softnet_data *sd, { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_save(*flags); } @@ -237,7 +237,7 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_disable(); } @@ -246,7 +246,7 @@ static inline void backlog_unlock_irq_restore(struct softnet_data *sd, { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_restore(*flags); } @@ -254,7 +254,7 @@ static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_enable(); } @@ -449,7 +449,9 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { + .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), +}; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. @@ -2160,7 +2162,7 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } @@ -3940,6 +3942,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } +#ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return __this_cpu_read(softnet_data.xmit.skip_txqueue); @@ -3950,6 +3953,19 @@ void netdev_xmit_skip_txqueue(bool skip) __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); + +#else +static bool netdev_xmit_txqueue_skipped(void) +{ + return current->net_xmit.skip_txqueue; +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + current->net_xmit.skip_txqueue = skip; +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS @@ -4029,10 +4045,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; @@ -4061,10 +4080,12 @@ ingress_verdict: break; } *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4074,8 +4095,10 @@ ingress_verdict: fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -4085,11 +4108,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ @@ -4105,10 +4131,12 @@ egress_verdict: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4118,8 +4146,10 @@ egress_verdict: fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -5234,7 +5264,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, - get_kfree_skb_cb(skb)->reason); + get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); @@ -5935,6 +5965,7 @@ static void flush_backlog(struct work_struct *work) } backlog_unlock_irq_enable(sd); + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); @@ -5942,6 +5973,7 @@ static void flush_backlog(struct work_struct *work) rps_input_queue_head_incr(sd); } } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); } @@ -6063,7 +6095,9 @@ static int process_backlog(struct napi_struct *napi, int quota) while (again) { struct sk_buff *skb; + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); @@ -6072,7 +6106,9 @@ static int process_backlog(struct napi_struct *napi, int quota) return work; } + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { @@ -6087,8 +6123,10 @@ static int process_backlog(struct napi_struct *napi, int quota) napi->state &= NAPIF_STATE_THREADED; again = false; } else { + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } backlog_unlock_irq_enable(sd); } @@ -6301,6 +6339,7 @@ enum { static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; @@ -6318,6 +6357,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); @@ -6340,6 +6380,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } @@ -6349,6 +6390,7 @@ static void __napi_busy_loop(unsigned int napi_id, { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; @@ -6367,6 +6409,7 @@ restart: int work = 0; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -6397,6 +6440,7 @@ count: __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) @@ -6824,6 +6868,7 @@ static int napi_thread_wait(struct napi_struct *napi) static void napi_threaded_poll_loop(struct napi_struct *napi) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; unsigned long last_qs = jiffies; @@ -6832,6 +6877,8 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) void *have; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; @@ -6847,6 +6894,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(sd); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!repoll) @@ -6872,10 +6920,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); @@ -6928,7 +6978,8 @@ start: sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); -end:; +end: + bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { @@ -10703,6 +10754,54 @@ void netdev_run_todo(void) wake_up(&netdev_unregistering_wq); } +/* Collate per-cpu network dstats statistics + * + * Read per-cpu network statistics from dev->dstats and populate the related + * fields in @s. + */ +static void dev_fetch_dstats(struct rtnl_link_stats64 *s, + const struct pcpu_dstats __percpu *dstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, rx_drops; + u64 tx_packets, tx_bytes, tx_drops; + const struct pcpu_dstats *stats; + unsigned int start; + + stats = per_cpu_ptr(dstats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + rx_drops = u64_stats_read(&stats->rx_drops); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + tx_drops = u64_stats_read(&stats->tx_drops); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->rx_dropped += rx_drops; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + s->tx_dropped += tx_drops; + } +} + +/* ndo_get_stats64 implementation for dtstats-based accounting. + * + * Populate @s from dev->stats and dev->dstats. This is used internally by the + * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. + */ +static void dev_get_dstats64(const struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_dstats(s, dev->dstats); +} + /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields @@ -10779,6 +10878,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { dev_get_tstats64(dev, storage); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { + dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } diff --git a/net/core/dev.h b/net/core/dev.h index b7b518bc2be5..5654325c5b71 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -150,6 +150,8 @@ struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 + +#ifndef CONFIG_PREEMPT_RT static inline bool dev_xmit_recursion(void) { return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > @@ -165,5 +167,25 @@ static inline void dev_xmit_recursion_dec(void) { __this_cpu_dec(softnet_data.xmit.recursion); } +#else +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} +#endif + +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9a66cf5015f2..b9719ed3c3fd 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -363,7 +363,6 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, return 0; } -EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib); static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 430ed18f8584..2e0ae3328232 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -109,7 +109,8 @@ static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason); + enum skb_drop_reason reason, + struct sock *rx_sk); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); @@ -264,7 +265,8 @@ out: static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason) + enum skb_drop_reason reason, + struct sock *rx_sk) { trace_drop_common(skb, location); } @@ -491,7 +493,8 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = { static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason) + enum skb_drop_reason reason, + struct sock *rx_sk) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; diff --git a/net/core/filter.c b/net/core/filter.c index 9933851c685e..eb1c4425c06f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1658,9 +1658,12 @@ struct bpf_scratchpad { __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; u8 buff[MAX_BPF_STACK]; }; + local_lock_t bh_lock; }; -static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) @@ -2021,6 +2024,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); u32 diff_size = from_size + to_size; int i, j = 0; + __wsum ret; /* This is quite flexible, some examples: * @@ -2034,12 +2038,15 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, diff_size > sizeof(sp->diff))) return -EINVAL; + local_lock_nested_bh(&bpf_sp.bh_lock); for (i = 0; i < from_size / sizeof(__be32); i++, j++) sp->diff[j] = ~from[i]; for (i = 0; i < to_size / sizeof(__be32); i++, j++) sp->diff[j] = to[i]; - return csum_partial(sp->diff, diff_size, seed); + ret = csum_partial(sp->diff, diff_size, seed); + local_unlock_nested_bh(&bpf_sp.bh_lock); + return ret; } static const struct bpf_func_proto bpf_csum_diff_proto = { @@ -2279,12 +2286,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2385,12 +2392,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2476,9 +2483,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); -EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); - static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2491,7 +2495,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev) int skb_do_redirect(struct sk_buff *skb) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; @@ -2524,7 +2528,7 @@ out_drop: BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; @@ -2545,7 +2549,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; @@ -2567,7 +2571,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; @@ -4293,30 +4297,13 @@ void xdp_do_check_flushed(struct napi_struct *napi) } #endif -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); @@ -4388,7 +4375,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn - * down by bpf_clear_redirect_map() + * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; @@ -4433,7 +4420,7 @@ err: int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4447,7 +4434,7 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4464,7 +4451,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, enum bpf_map_type map_type, u32 map_id, u32 flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; @@ -4476,7 +4463,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn - * down by bpf_clear_redirect_map() + * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; @@ -4518,7 +4505,7 @@ err: int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; @@ -4554,7 +4541,7 @@ err: BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; @@ -6455,6 +6442,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; + lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; @@ -6511,6 +6499,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, int hdroff = 0; int err; + lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) @@ -6587,6 +6576,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, int srhoff = 0; int ret; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; @@ -7731,17 +7721,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, return -EOPNOTSUPP; switch (tstamp_type) { - case BPF_SKB_TSTAMP_DELIVERY_MONO: + case BPF_SKB_CLOCK_REALTIME: + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_REALTIME; + break; + case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; - skb->mono_delivery_time = 1; + skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; - case BPF_SKB_TSTAMP_UNSPEC: - if (tstamp) + case BPF_SKB_CLOCK_TAI: + if (!tstamp) return -EINVAL; - skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; @@ -9392,16 +9386,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; - /* AX is needed because src_reg and dst_reg could be the same */ - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_BF_MONO_TC_OFFSET); - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK, 2); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); - *insn++ = BPF_JMP_A(1); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); + BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); + BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); + BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); + BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); +#else + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); +#endif return insn; } @@ -9444,11 +9439,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); - /* skb->tc_at_ingress && skb->mono_delivery_time, + /* check if ingress mask bits is set */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + *insn++ = BPF_JMP_A(4); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); + *insn++ = BPF_JMP_A(2); + /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); @@ -9473,7 +9469,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the - * mono_delivery_time bit also. + * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; @@ -9483,8 +9479,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); - /* <clear>: mono_delivery_time */ - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); + /* <clear>: skb->tstamp_type */ + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index f82e9a7d3b37..e64a26379807 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -382,7 +382,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP) && !dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ENC_OPTS)) + FLOW_DISSECTOR_KEY_ENC_OPTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_FLAGS)) return; info = skb_tunnel_info(skb); @@ -475,6 +477,18 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, IP_TUNNEL_GENEVE_OPT_BIT); enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_FLAGS)) { + struct flow_dissector_key_enc_flags *enc_flags; + IP_TUNNEL_DECLARE_FLAGS(flags) = {}; + + enc_flags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_FLAGS, + target_container); + ip_tunnel_set_encflags_present(flags); + ip_tunnel_flags_and(flags, flags, info->key.tun_flags); + enc_flags->flags = bitmap_read(flags, IP_TUNNEL_CSUM_BIT, 32); + } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); @@ -1792,6 +1806,13 @@ u32 flow_hash_from_keys(struct flow_keys *keys) } EXPORT_SYMBOL(flow_hash_from_keys); +u32 flow_hash_from_keys_seed(struct flow_keys *keys, + const siphash_key_t *keyval) +{ + return __flow_hash_from_keys(keys, keyval); +} +EXPORT_SYMBOL(flow_hash_from_keys_seed); + static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) @@ -1831,22 +1852,23 @@ EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; -u32 __skb_get_hash_symmetric(const struct sk_buff *skb) +u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); - __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, + __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } -EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); +EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** - * __skb_get_hash: calculate a flow hash + * __skb_get_hash_net: calculate a flow hash + * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses @@ -1854,18 +1876,24 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ -void __skb_get_hash(struct sk_buff *skb) +void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; + memset(&keys, 0, sizeof(keys)); + + __skb_flow_dissect(net, skb, &flow_keys_dissector, + &keys, NULL, 0, 0, 0, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, &hashrnd); + hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } -EXPORT_SYMBOL(__skb_get_hash); +EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index fae9c4694186..412816076b8b 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -206,7 +206,7 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) { struct net_rate_estimator *est; - est = xchg((__force struct net_rate_estimator **)rate_est, NULL); + est = unrcu_pointer(xchg(rate_est, NULL)); if (est) { timer_shutdown_sync(&est->timer); kfree_rcu(est, rcu); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 4a0797f0a154..afb05f58b64c 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -38,13 +38,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, struct dst_entry *dst, bool can_redirect) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int ret; - /* Migration disable and BH disable are needed to protect per-cpu - * redirect_info between BPF prog and skb_do_redirect(). + /* Disabling BH is needed to protect per-CPU bpf_redirect_info between + * BPF prog and skb_do_redirect(). */ - migrate_disable(); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -77,8 +78,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, break; } + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); - migrate_enable(); return ret; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 45fd88405b6b..277751375b0a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3578,7 +3578,7 @@ static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, rcu_read_unlock(); } -static void neigh_proc_update(struct ctl_table *ctl, int write) +static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f4444b4e39e6..3927a0a7fa9a 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1027,8 +1027,8 @@ static void page_pool_disable_direct_recycling(struct page_pool *pool) /* To avoid races with recycling and additional barriers make sure * pool and NAPI are unlinked when NAPI is disabled. */ - WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) || - READ_ONCE(pool->p.napi->list_owner) != -1); + WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); + WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); WRITE_ONCE(pool->p.napi, NULL); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4668d6718040..eabfc8290f5e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6486,6 +6486,7 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED); rtnl_dumpit_func dumpit = cb->data; int err; @@ -6495,7 +6496,11 @@ static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (!dumpit) return 0; + if (needs_lock) + rtnl_lock(); err = dumpit(skb, cb); + if (needs_lock) + rtnl_unlock(); /* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). * Some applications which parse netlink manually depend on this. @@ -6515,7 +6520,8 @@ static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { - if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE || + !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) { WARN_ON(control->data); control->data = control->dump; control->dump = rtnl_dumpit; @@ -6703,7 +6709,6 @@ static int __net_init rtnetlink_net_init(struct net *net) struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, - .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 466999a7515e..eb9a7e65b5c8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -277,6 +277,7 @@ static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) #endif struct napi_alloc_cache { + local_lock_t bh_lock; struct page_frag_cache page; struct page_frag_1k page_small; unsigned int skb_count; @@ -284,7 +285,9 @@ struct napi_alloc_cache { }; static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /* Double check that napi_get_frags() allocates skbs with * skb->head being backed by slab, not a page fragment. @@ -306,11 +309,16 @@ void napi_get_frags_check(struct napi_struct *napi) void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + void *data; fragsz = SKB_DATA_ALIGN(fragsz); - return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + return data; + } EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -318,19 +326,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { void *data; - fragsz = SKB_DATA_ALIGN(fragsz); if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); + fragsz = SKB_DATA_ALIGN(fragsz); data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); } else { - struct napi_alloc_cache *nc; - local_bh_disable(); - nc = this_cpu_ptr(&napi_alloc_cache); - data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, - align_mask); + data = __napi_alloc_frag_align(fragsz, align_mask); local_bh_enable(); } return data; @@ -342,16 +346,20 @@ static struct sk_buff *napi_skb_cache_get(void) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC, NAPI_SKB_CACHE_BULK, nc->skb_cache); - if (unlikely(!nc->skb_count)) + if (unlikely(!nc->skb_count)) { + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; + } } skb = nc->skb_cache[--nc->skb_count]; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; @@ -744,9 +752,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, pfmemalloc = nc->pfmemalloc; } else { local_bh_disable(); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; + + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); } @@ -810,11 +822,11 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } - nc = this_cpu_ptr(&napi_alloc_cache); - if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache); if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { /* we are artificially inflating the allocation size, but * that is not as bad as it may look like, as: @@ -836,6 +848,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) data = page_frag_alloc(&nc->page, len, gfp_mask); pfmemalloc = nc->page.pfmemalloc; } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) return NULL; @@ -1190,7 +1203,8 @@ void __kfree_skb(struct sk_buff *skb) EXPORT_SYMBOL(__kfree_skb); static __always_inline -bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason reason) { if (unlikely(!skb_unref(skb))) return false; @@ -1203,26 +1217,27 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) if (reason == SKB_CONSUMED) trace_consume_skb(skb, __builtin_return_address(0)); else - trace_kfree_skb(skb, __builtin_return_address(0), reason); + trace_kfree_skb(skb, __builtin_return_address(0), reason, sk); return true; } /** - * kfree_skb_reason - free an sk_buff with special reason + * sk_skb_reason_drop - free an sk_buff with special reason + * @sk: the socket to receive @skb, or NULL if not applicable * @skb: buffer to free * @reason: reason why this skb is dropped * - * Drop a reference to the buffer and free it if the usage count has - * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' - * tracepoint. + * Drop a reference to the buffer and free it if the usage count has hit + * zero. Meanwhile, pass the receiving socket and drop reason to + * 'kfree_skb' tracepoint. */ void __fix_address -kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - if (__kfree_skb_reason(skb, reason)) + if (__sk_skb_reason_drop(sk, skb, reason)) __kfree_skb(skb); } -EXPORT_SYMBOL(kfree_skb_reason); +EXPORT_SYMBOL(sk_skb_reason_drop); #define KFREE_SKB_BULK_SIZE 16 @@ -1261,7 +1276,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) while (segs) { struct sk_buff *next = segs->next; - if (__kfree_skb_reason(segs, reason)) { + if (__sk_skb_reason_drop(NULL, segs, reason)) { skb_poison_list(segs); kfree_skb_add_bulk(segs, &sa, reason); } @@ -1433,6 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) if (!kasan_mempool_poison_object(skb)) return; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { @@ -1444,6 +1460,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) @@ -4139,6 +4156,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_zcopy(tgt) || skb_zcopy(skb)) return 0; + DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); + DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); + todo = shiftlen; from = 0; to = skb_shinfo(tgt)->nr_frags; diff --git a/net/core/sock.c b/net/core/sock.c index 100e975073ca..9abc4fe25953 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1083,6 +1083,17 @@ bool sockopt_capable(int cap) } EXPORT_SYMBOL(sockopt_capable); +static int sockopt_validate_clockid(__kernel_clockid_t value) +{ + switch (value) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_TAI: + return 0; + } + return -EINVAL; +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -1497,6 +1508,11 @@ set_sndbuf: ret = -EPERM; break; } + + ret = sockopt_validate_clockid(sk_txtime.clockid); + if (ret) + break; + sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = @@ -2262,7 +2278,12 @@ static void sk_init_common(struct sock *sk) lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); - lockdep_set_class_and_name(&sk->sk_callback_lock, + if (sk->sk_kern_sock) + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_kern_callback_keys + sk->sk_family, + af_family_kern_clock_key_strings[sk->sk_family]); + else + lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } @@ -3460,18 +3481,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) } sk->sk_uid = uid; - rwlock_init(&sk->sk_callback_lock); - if (sk->sk_kern_sock) - lockdep_set_class_and_name( - &sk->sk_callback_lock, - af_kern_callback_keys + sk->sk_family, - af_family_kern_clock_key_strings[sk->sk_family]); - else - lockdep_set_class_and_name( - &sk->sk_callback_lock, - af_callback_keys + sk->sk_family, - af_family_clock_key_strings[sk->sk_family]); - sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 654122838025..a08eed9b9142 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -18,7 +18,7 @@ static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; -static struct sock_diag_inet_compat __rcu *inet_rcv_compat; +static const struct sock_diag_inet_compat __rcu *inet_rcv_compat; static struct workqueue_struct *broadcast_wq; @@ -187,8 +187,7 @@ void sock_diag_broadcast_destroy(struct sock *sk) void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr) { - xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, - ptr); + xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr)); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); @@ -196,8 +195,7 @@ void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr) { const struct sock_diag_inet_compat *old; - old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, - NULL); + old = unrcu_pointer(xchg(&inet_rcv_compat, NULL)); WARN_ON_ONCE(old != ptr); } EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c9fb9ad87485..2079000691e2 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -383,38 +383,6 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, static struct ctl_table net_core_table[] = { { - .procname = "wmem_max", - .data = &sysctl_wmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_max", - .data = &sysctl_rmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { - .procname = "wmem_default", - .data = &sysctl_wmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &sysctl_rmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { .procname = "mem_pcpu_rsv", .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), @@ -697,6 +665,41 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, + /* sysctl_core_net_init() will set the values after this + * to readonly in network namespaces + */ + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -724,8 +727,14 @@ static __net_init int sysctl_core_net_init(struct net *net) if (tbl == NULL) goto err_dup; - for (i = 0; i < table_size; ++i) + for (i = 0; i < table_size; ++i) { + if (tbl[i].data == &sysctl_wmem_max) + break; + tbl[i].data += (char *)net - (char *)&init_net; + } + for (; i < table_size; ++i) + tbl[i].mode &= ~0222; } net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 251a57cf5822..fecc8190064f 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -54,17 +54,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (state == DCCP_TIME_WAIT) timeo = DCCP_TIMEWAIT_LEN; - /* tw_timer is pinned, so we need to make sure BH are disabled - * in following section, otherwise timer handler could run before - * we complete the initialization. - */ - local_bh_disable(); - inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, &dccp_hashinfo); - local_bh_enable(); + inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than diff --git a/net/devlink/dpipe.c b/net/devlink/dpipe.c index a72a9292efc5..55009b377447 100644 --- a/net/devlink/dpipe.c +++ b/net/devlink/dpipe.c @@ -839,7 +839,7 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); */ int devl_dpipe_table_register(struct devlink *devlink, const char *table_name, - struct devlink_dpipe_table_ops *table_ops, + const struct devlink_dpipe_table_ops *table_ops, void *priv, bool counter_control_extern) { struct devlink_dpipe_table *table; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 8e698bea99a3..8d5bf869eb14 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -129,7 +129,7 @@ config NET_DSA_TAG_RTL4_A tristate "Tag driver for Realtek 4 byte protocol A tags" help Say Y or M if you want to enable support for tagging frames for the - Realtek switches with 4 byte protocol A tags, sich as found in + Realtek switches with 4 byte protocol A tags, such as found in the Realtek RTL8366RB. config NET_DSA_TAG_RTL8_4 diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 12521a7d4048..668c729946ea 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1507,9 +1507,7 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (ds->phylink_mac_ops) { if (ds->ops->phylink_mac_select_pcs || - ds->ops->phylink_mac_prepare || ds->ops->phylink_mac_config || - ds->ops->phylink_mac_finish || ds->ops->phylink_mac_link_down || ds->ops->phylink_mac_link_up) return -EINVAL; diff --git a/net/dsa/port.c b/net/dsa/port.c index 9a249d4ac3a5..25258b33e59e 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1467,10 +1467,34 @@ int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit, */ dsa_user_unsync_ha(dev); + /* If live-changing, we also need to uninstall the user device address + * from the port FDB and the conduit interface. + */ + if (dev->flags & IFF_UP) + dsa_user_host_uc_uninstall(dev); + err = dsa_port_assign_conduit(dp, conduit, extack, true); if (err) goto rewind_old_addrs; + /* If the port doesn't have its own MAC address and relies on the DSA + * conduit's one, inherit it again from the new DSA conduit. + */ + if (is_zero_ether_addr(dp->mac)) + eth_hw_addr_inherit(dev, conduit); + + /* If live-changing, we need to install the user device address to the + * port FDB and the conduit interface. + */ + if (dev->flags & IFF_UP) { + err = dsa_user_host_uc_install(dev, dev->dev_addr); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to install host UC address"); + goto rewind_addr_inherit; + } + } + dsa_user_sync_ha(dev); if (vlan_filtering) { @@ -1500,10 +1524,26 @@ rewind_new_vlan: rewind_new_addrs: dsa_user_unsync_ha(dev); + if (dev->flags & IFF_UP) + dsa_user_host_uc_uninstall(dev); + +rewind_addr_inherit: + if (is_zero_ether_addr(dp->mac)) + eth_hw_addr_inherit(dev, old_conduit); + dsa_port_assign_conduit(dp, old_conduit, NULL, false); /* Restore the objects on the old CPU port */ rewind_old_addrs: + if (dev->flags & IFF_UP) { + tmp = dsa_user_host_uc_install(dev, dev->dev_addr); + if (tmp) { + dev_err(ds->dev, + "port %d failed to restore host UC address: %pe\n", + dp->index, ERR_PTR(tmp)); + } + } + dsa_user_sync_ha(dev); if (vlan_filtering) { @@ -1549,21 +1589,6 @@ dsa_port_phylink_mac_select_pcs(struct phylink_config *config, return pcs; } -static int dsa_port_phylink_mac_prepare(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface) -{ - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - int err = 0; - - if (ds->ops->phylink_mac_prepare) - err = ds->ops->phylink_mac_prepare(ds, dp->index, mode, - interface); - - return err; -} - static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) @@ -1577,21 +1602,6 @@ static void dsa_port_phylink_mac_config(struct phylink_config *config, ds->ops->phylink_mac_config(ds, dp->index, mode, state); } -static int dsa_port_phylink_mac_finish(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface) -{ - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - int err = 0; - - if (ds->ops->phylink_mac_finish) - err = ds->ops->phylink_mac_finish(ds, dp->index, mode, - interface); - - return err; -} - static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) @@ -1624,9 +1634,7 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .mac_select_pcs = dsa_port_phylink_mac_select_pcs, - .mac_prepare = dsa_port_phylink_mac_prepare, .mac_config = dsa_port_phylink_mac_config, - .mac_finish = dsa_port_phylink_mac_finish, .mac_link_down = dsa_port_phylink_mac_link_down, .mac_link_up = dsa_port_phylink_mac_link_up, }; diff --git a/net/dsa/user.c b/net/dsa/user.c index 867c5fe9a4da..e8f56a40b614 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -355,60 +355,82 @@ static int dsa_user_get_iflink(const struct net_device *dev) return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } -static int dsa_user_open(struct net_device *dev) +int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err; - err = dev_open(conduit, NULL); - if (err < 0) { - netdev_err(dev, "failed to open conduit %s\n", conduit->name); - goto out; - } - if (dsa_switch_supports_uc_filtering(ds)) { - err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0); + err = dsa_port_standalone_host_fdb_add(dp, addr, 0); if (err) goto out; } - if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) { - err = dev_uc_add(conduit, dev->dev_addr); + if (!ether_addr_equal(addr, conduit->dev_addr)) { + err = dev_uc_add(conduit, addr); if (err < 0) goto del_host_addr; } - err = dsa_port_enable_rt(dp, dev->phydev); - if (err) - goto del_unicast; - return 0; -del_unicast: - if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) - dev_uc_del(conduit, dev->dev_addr); del_host_addr: if (dsa_switch_supports_uc_filtering(ds)) - dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); + dsa_port_standalone_host_fdb_del(dp, addr, 0); out: return err; } -static int dsa_user_close(struct net_device *dev) +void dsa_user_host_uc_uninstall(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; - dsa_port_disable_rt(dp); - if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); +} + +static int dsa_user_open(struct net_device *dev) +{ + struct net_device *conduit = dsa_user_to_conduit(dev); + struct dsa_port *dp = dsa_user_to_port(dev); + int err; + + err = dev_open(conduit, NULL); + if (err < 0) { + netdev_err(dev, "failed to open conduit %s\n", conduit->name); + goto out; + } + + err = dsa_user_host_uc_install(dev, dev->dev_addr); + if (err) + goto out; + + err = dsa_port_enable_rt(dp, dev->phydev); + if (err) + goto out_del_host_uc; + + return 0; + +out_del_host_uc: + dsa_user_host_uc_uninstall(dev); +out: + return err; +} + +static int dsa_user_close(struct net_device *dev) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + + dsa_port_disable_rt(dp); + + dsa_user_host_uc_uninstall(dev); return 0; } @@ -448,7 +470,6 @@ static void dsa_user_set_rx_mode(struct net_device *dev) static int dsa_user_set_mac_address(struct net_device *dev, void *a) { - struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; @@ -470,34 +491,16 @@ static int dsa_user_set_mac_address(struct net_device *dev, void *a) if (!(dev->flags & IFF_UP)) goto out_change_dev_addr; - if (dsa_switch_supports_uc_filtering(ds)) { - err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0); - if (err) - return err; - } - - if (!ether_addr_equal(addr->sa_data, conduit->dev_addr)) { - err = dev_uc_add(conduit, addr->sa_data); - if (err < 0) - goto del_unicast; - } + err = dsa_user_host_uc_install(dev, addr->sa_data); + if (err) + return err; - if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) - dev_uc_del(conduit, dev->dev_addr); - - if (dsa_switch_supports_uc_filtering(ds)) - dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); + dsa_user_host_uc_uninstall(dev); out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; - -del_unicast: - if (dsa_switch_supports_uc_filtering(ds)) - dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0); - - return err; } struct dsa_user_dump_ctx { @@ -2879,12 +2882,6 @@ int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit, ERR_PTR(err)); } - /* If the port doesn't have its own MAC address and relies on the DSA - * conduit's one, inherit it again from the new DSA conduit. - */ - if (is_zero_ether_addr(dp->mac)) - eth_hw_addr_inherit(dev, conduit); - return 0; out_revert_conduit_link: diff --git a/net/dsa/user.h b/net/dsa/user.h index 996069130bea..016884bead3c 100644 --- a/net/dsa/user.h +++ b/net/dsa/user.h @@ -42,6 +42,8 @@ int dsa_user_suspend(struct net_device *user_dev); int dsa_user_resume(struct net_device *user_dev); int dsa_user_register_notifier(void); void dsa_user_unregister_notifier(void); +int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr); +void dsa_user_host_uc_uninstall(struct net_device *dev); void dsa_user_sync_ha(struct net_device *dev); void dsa_user_unsync_ha(struct net_device *dev); void dsa_user_setup_tagger(struct net_device *user); diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 06a151165c31..f6f136ec7ddf 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -207,10 +207,6 @@ err: } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); -struct cable_test_tdr_req_info { - struct ethnl_req_info base; -}; - static const struct nla_policy cable_test_tdr_act_cfg_policy[] = { [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 83112c1a71ae..759b16e3d134 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/dim.h> #include "netlink.h" #include "common.h" @@ -82,6 +83,14 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, static int coalesce_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { + int modersz = nla_total_size(0) + /* _PROFILE_IRQ_MODERATION, nest */ + nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_USEC */ + nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_PKTS */ + nla_total_size(sizeof(u32)); /* _IRQ_MODERATION_COMPS */ + + int total_modersz = nla_total_size(0) + /* _{R,T}X_PROFILE, nest */ + modersz * NET_DIM_PARAMS_NUM_PROFILES; + return nla_total_size(sizeof(u32)) + /* _RX_USECS */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */ @@ -108,7 +117,8 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ - nla_total_size(sizeof(u32)); /* _TX_AGGR_TIME_USECS */ + nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */ + total_modersz * 2; /* _{R,T}X_PROFILE */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -127,14 +137,84 @@ static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val, return nla_put_u8(skb, attr_type, !!val); } +/** + * coalesce_put_profile - fill reply with a nla nest with four child nla nests. + * @skb: socket buffer the message is stored in + * @attr_type: nest attr type ETHTOOL_A_COALESCE_*X_PROFILE + * @profile: data passed to userspace + * @coal_flags: modifiable parameters supported by the driver + * + * Put a dim profile nest attribute. Refer to ETHTOOL_A_PROFILE_IRQ_MODERATION. + * + * Return: 0 on success or a negative error code. + */ +static int coalesce_put_profile(struct sk_buff *skb, u16 attr_type, + const struct dim_cq_moder *profile, + u8 coal_flags) +{ + struct nlattr *profile_attr, *moder_attr; + int i, ret; + + if (!profile || !coal_flags) + return 0; + + profile_attr = nla_nest_start(skb, attr_type); + if (!profile_attr) + return -EMSGSIZE; + + for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) { + moder_attr = nla_nest_start(skb, + ETHTOOL_A_PROFILE_IRQ_MODERATION); + if (!moder_attr) { + ret = -EMSGSIZE; + goto cancel_profile; + } + + if (coal_flags & DIM_COALESCE_USEC) { + ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_USEC, + profile[i].usec); + if (ret) + goto cancel_moder; + } + + if (coal_flags & DIM_COALESCE_PKTS) { + ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_PKTS, + profile[i].pkts); + if (ret) + goto cancel_moder; + } + + if (coal_flags & DIM_COALESCE_COMPS) { + ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_COMPS, + profile[i].comps); + if (ret) + goto cancel_moder; + } + + nla_nest_end(skb, moder_attr); + } + + nla_nest_end(skb, profile_attr); + + return 0; + +cancel_moder: + nla_nest_cancel(skb, moder_attr); +cancel_profile: + nla_nest_cancel(skb, profile_attr); + return ret; +} + static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce; + struct dim_irq_moder *moder = req_base->dev->irq_moder; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; + int ret = 0; if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS, coal->rx_coalesce_usecs, supported) || @@ -192,11 +272,41 @@ static int coalesce_fill_reply(struct sk_buff *skb, kcoal->tx_aggr_time_usecs, supported)) return -EMSGSIZE; - return 0; + if (!moder) + return 0; + + rcu_read_lock(); + if (moder->profile_flags & DIM_PROFILE_RX) { + ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_RX_PROFILE, + rcu_dereference(moder->rx_profile), + moder->coal_flags); + if (ret) + goto out; + } + + if (moder->profile_flags & DIM_PROFILE_TX) + ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_TX_PROFILE, + rcu_dereference(moder->tx_profile), + moder->coal_flags); + +out: + rcu_read_unlock(); + return ret; } /* COALESCE_SET */ +static const struct nla_policy coalesce_irq_moderation_policy[] = { + [ETHTOOL_A_IRQ_MODERATION_USEC] = { .type = NLA_U32 }, + [ETHTOOL_A_IRQ_MODERATION_PKTS] = { .type = NLA_U32 }, + [ETHTOOL_A_IRQ_MODERATION_COMPS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy coalesce_profile_policy[] = { + [ETHTOOL_A_PROFILE_IRQ_MODERATION] = + NLA_POLICY_NESTED(coalesce_irq_moderation_policy), +}; + const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -227,6 +337,10 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_PROFILE] = + NLA_POLICY_NESTED(coalesce_profile_policy), + [ETHTOOL_A_COALESCE_TX_PROFILE] = + NLA_POLICY_NESTED(coalesce_profile_policy), }; static int @@ -234,6 +348,7 @@ ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + struct dim_irq_moder *irq_moder = req_info->dev->irq_moder; struct nlattr **tb = info->attrs; u32 supported_params; u16 a; @@ -243,6 +358,12 @@ ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, /* make sure that only supported parameters are present */ supported_params = ops->supported_coalesce_params; + if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_RX) + supported_params |= ETHTOOL_COALESCE_RX_PROFILE; + + if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_TX) + supported_params |= ETHTOOL_COALESCE_TX_PROFILE; + for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) if (tb[a] && !(supported_params & attr_to_mask(a))) { NL_SET_ERR_MSG_ATTR(info->extack, tb[a], @@ -253,6 +374,138 @@ ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, return 1; } +/** + * ethnl_update_irq_moder - update a specific field in the given profile + * @irq_moder: place that collects dim related information + * @irq_field: field in profile to modify + * @attr_type: attr type ETHTOOL_A_IRQ_MODERATION_* + * @tb: netlink attribute with new values or null + * @coal_bit: DIM_COALESCE_* bit from coal_flags + * @mod: pointer to bool for modification tracking + * @extack: netlink extended ack + * + * Return: 0 on success or a negative error code. + */ +static int ethnl_update_irq_moder(struct dim_irq_moder *irq_moder, + u16 *irq_field, u16 attr_type, + struct nlattr **tb, + u8 coal_bit, bool *mod, + struct netlink_ext_ack *extack) +{ + int ret = 0; + u32 val; + + if (!tb[attr_type]) + return 0; + + if (irq_moder->coal_flags & coal_bit) { + val = nla_get_u32(tb[attr_type]); + if (*irq_field == val) + return 0; + + *irq_field = val; + *mod = true; + } else { + NL_SET_BAD_ATTR(extack, tb[attr_type]); + ret = -EOPNOTSUPP; + } + + return ret; +} + +/** + * ethnl_update_profile - get a profile nest with child nests from userspace. + * @dev: netdevice to update the profile + * @dst: profile get from the driver and modified by ethnl_update_profile. + * @nests: nest attr ETHTOOL_A_COALESCE_*X_PROFILE to set profile. + * @mod: pointer to bool for modification tracking + * @extack: Netlink extended ack + * + * Layout of nests: + * Nested ETHTOOL_A_COALESCE_*X_PROFILE attr + * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr + * ETHTOOL_A_IRQ_MODERATION_USEC attr + * ETHTOOL_A_IRQ_MODERATION_PKTS attr + * ETHTOOL_A_IRQ_MODERATION_COMPS attr + * ... + * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr + * ETHTOOL_A_IRQ_MODERATION_USEC attr + * ETHTOOL_A_IRQ_MODERATION_PKTS attr + * ETHTOOL_A_IRQ_MODERATION_COMPS attr + * + * Return: 0 on success or a negative error code. + */ +static int ethnl_update_profile(struct net_device *dev, + struct dim_cq_moder __rcu **dst, + const struct nlattr *nests, + bool *mod, + struct netlink_ext_ack *extack) +{ + int len_irq_moder = ARRAY_SIZE(coalesce_irq_moderation_policy); + struct nlattr *tb[ARRAY_SIZE(coalesce_irq_moderation_policy)]; + struct dim_irq_moder *irq_moder = dev->irq_moder; + struct dim_cq_moder *new_profile, *old_profile; + int ret, rem, i = 0, len; + struct nlattr *nest; + + if (!nests) + return 0; + + if (!*dst) + return -EOPNOTSUPP; + + old_profile = rtnl_dereference(*dst); + len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*old_profile); + new_profile = kmemdup(old_profile, len, GFP_KERNEL); + if (!new_profile) + return -ENOMEM; + + nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION, + nests, rem) { + ret = nla_parse_nested(tb, len_irq_moder - 1, nest, + coalesce_irq_moderation_policy, + extack); + if (ret) + goto err_out; + + ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].usec, + ETHTOOL_A_IRQ_MODERATION_USEC, + tb, DIM_COALESCE_USEC, + mod, extack); + if (ret) + goto err_out; + + ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].pkts, + ETHTOOL_A_IRQ_MODERATION_PKTS, + tb, DIM_COALESCE_PKTS, + mod, extack); + if (ret) + goto err_out; + + ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].comps, + ETHTOOL_A_IRQ_MODERATION_COMPS, + tb, DIM_COALESCE_COMPS, + mod, extack); + if (ret) + goto err_out; + + i++; + } + + /* After the profile is modified, dim itself is a dynamic + * mechanism and will quickly fit to the appropriate + * coalescing parameters according to the new profile. + */ + rcu_assign_pointer(*dst, new_profile); + kfree_rcu(old_profile, rcu); + + return 0; + +err_out: + kfree(new_profile); + return ret; +} + static int __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info, bool *dual_change) @@ -317,6 +570,22 @@ __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info, ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); + if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) { + ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile, + tb[ETHTOOL_A_COALESCE_RX_PROFILE], + &mod, info->extack); + if (ret < 0) + return ret; + } + + if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_TX) { + ret = ethnl_update_profile(dev, &dev->irq_moder->tx_profile, + tb[ETHTOOL_A_COALESCE_TX_PROFILE], + &mod, info->extack); + if (ret < 0) + return ret; + } + /* Update operation modes */ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce, tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod_mode); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e6904288d40d..e4cc6b78dcfc 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -73,9 +73,15 @@ static void hsr_check_announce(struct net_device *hsr_dev) mod_timer(&hsr->announce_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); } + + if (hsr->redbox && !timer_pending(&hsr->announce_proxy_timer)) + mod_timer(&hsr->announce_proxy_timer, jiffies + + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL) / 2); } else { /* Deactivate the announce timer */ timer_delete(&hsr->announce_timer); + if (hsr->redbox) + timer_delete(&hsr->announce_proxy_timer); } } @@ -279,10 +285,11 @@ out: return NULL; } -static void send_hsr_supervision_frame(struct hsr_port *master, - unsigned long *interval) +static void send_hsr_supervision_frame(struct hsr_port *port, + unsigned long *interval, + const unsigned char *addr) { - struct hsr_priv *hsr = master->hsr; + struct hsr_priv *hsr = port->hsr; __u8 type = HSR_TLV_LIFE_CHECK; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_stlv; @@ -296,9 +303,9 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr->announce_count++; } - skb = hsr_init_skb(master); + skb = hsr_init_skb(port); if (!skb) { - netdev_warn_once(master->dev, "HSR: Could not send supervision frame\n"); + netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n"); return; } @@ -321,11 +328,12 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? sizeof(struct hsr_sup_payload) : 12; - /* Payload: MacAddressA */ + /* Payload: MacAddressA / SAN MAC from ProxyNodeTable */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); - ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); + ether_addr_copy(hsr_sp->macaddress_A, addr); - if (hsr->redbox) { + if (hsr->redbox && + hsr_is_node_in_db(&hsr->proxy_node_db, addr)) { hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv)); hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC; hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload); @@ -340,13 +348,14 @@ static void send_hsr_supervision_frame(struct hsr_port *master, return; } - hsr_forward_skb(skb, master); + hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); return; } static void send_prp_supervision_frame(struct hsr_port *master, - unsigned long *interval) + unsigned long *interval, + const unsigned char *addr) { struct hsr_priv *hsr = master->hsr; struct hsr_sup_payload *hsr_sp; @@ -396,7 +405,7 @@ static void hsr_announce(struct timer_list *t) rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - hsr->proto_ops->send_sv_frame(master, &interval); + hsr->proto_ops->send_sv_frame(master, &interval, master->dev->dev_addr); if (is_admin_up(master->dev)) mod_timer(&hsr->announce_timer, jiffies + interval); @@ -404,6 +413,37 @@ static void hsr_announce(struct timer_list *t) rcu_read_unlock(); } +/* Announce (supervision frame) timer function for RedBox + */ +static void hsr_proxy_announce(struct timer_list *t) +{ + struct hsr_priv *hsr = from_timer(hsr, t, announce_proxy_timer); + struct hsr_port *interlink; + unsigned long interval = 0; + struct hsr_node *node; + + rcu_read_lock(); + /* RedBOX sends supervisory frames to HSR network with MAC addresses + * of SAN nodes stored in ProxyNodeTable. + */ + interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); + list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { + if (hsr_addr_is_redbox(hsr, node->macaddress_A)) + continue; + hsr->proto_ops->send_sv_frame(interlink, &interval, + node->macaddress_A); + } + + if (is_admin_up(interlink->dev)) { + if (!interval) + interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); + + mod_timer(&hsr->announce_proxy_timer, jiffies + interval); + } + + rcu_read_unlock(); +} + void hsr_del_ports(struct hsr_priv *hsr) { struct hsr_port *port; @@ -590,6 +630,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0); + timer_setup(&hsr->announce_proxy_timer, hsr_proxy_announce, 0); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 05a61b8286ec..b38060246e62 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -117,6 +117,35 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) return true; } +static bool is_proxy_supervision_frame(struct hsr_priv *hsr, + struct sk_buff *skb) +{ + struct hsr_sup_payload *payload; + struct ethhdr *eth_hdr; + u16 total_length = 0; + + eth_hdr = (struct ethhdr *)skb_mac_header(skb); + + /* Get the HSR protocol revision. */ + if (eth_hdr->h_proto == htons(ETH_P_HSR)) + total_length = sizeof(struct hsrv1_ethhdr_sp); + else + total_length = sizeof(struct hsrv0_ethhdr_sp); + + if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_payload))) + return false; + + skb_pull(skb, total_length); + payload = (struct hsr_sup_payload *)skb->data; + skb_push(skb, total_length); + + /* For RedBox (HSR-SAN) check if we have received the supervision + * frame with MAC addresses from own ProxyNodeTable. + */ + return hsr_is_node_in_db(&hsr->proxy_node_db, + payload->macaddress_A); +} + static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in, struct hsr_frame_info *frame) { @@ -392,9 +421,9 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { return ((frame->port_rcv->type == HSR_PT_SLAVE_A && - port->type == HSR_PT_SLAVE_B) || + port->type == HSR_PT_SLAVE_B) || (frame->port_rcv->type == HSR_PT_SLAVE_B && - port->type == HSR_PT_SLAVE_A)); + port->type == HSR_PT_SLAVE_A)); } bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) @@ -499,7 +528,8 @@ static void hsr_forward_do(struct hsr_frame_info *frame) frame->sequence_nr)) continue; - if (frame->is_supervision && port->type == HSR_PT_MASTER) { + if (frame->is_supervision && port->type == HSR_PT_MASTER && + !frame->is_proxy_supervision) { hsr_handle_sup_frame(frame); continue; } @@ -637,6 +667,9 @@ static int fill_frame_info(struct hsr_frame_info *frame, memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); + if (frame->is_supervision && hsr->redbox) + frame->is_proxy_supervision = + is_proxy_supervision_frame(port->hsr, skb); n_db = &hsr->node_db; if (port->type == HSR_PT_INTERLINK) @@ -688,7 +721,7 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) /* Gets called for ingress frames as well as egress from master port. * So check and increment stats for master port only here. */ - if (port->type == HSR_PT_MASTER) { + if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { port->dev->stats.tx_packets++; port->dev->stats.tx_bytes += skb->len; } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 614df9649794..73bc6f659812 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -36,6 +36,14 @@ static bool seq_nr_after(u16 a, u16 b) #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) +bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr) +{ + if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox)) + return false; + + return ether_addr_equal(addr, hsr->macaddress_redbox); +} + bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_self_node *sn; @@ -591,6 +599,10 @@ void hsr_prune_proxy_nodes(struct timer_list *t) spin_lock_bh(&hsr->list_lock); list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) { + /* Don't prune RedBox node. */ + if (hsr_addr_is_redbox(hsr, node->macaddress_A)) + continue; + timestamp = node->time_in[HSR_PT_INTERLINK]; /* Prune old entries */ diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 7619e31c1d2d..993fa950d814 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -22,6 +22,7 @@ struct hsr_frame_info { struct hsr_node *node_src; u16 sequence_nr; bool is_supervision; + bool is_proxy_supervision; bool is_vlan; bool is_local_dest; bool is_local_exclusive; @@ -35,6 +36,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, enum hsr_port_type rx_port); void hsr_handle_sup_frame(struct hsr_frame_info *frame); bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr); +bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr); void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb); void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 23850b16d1ea..ab1f8d35d9dc 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -170,7 +170,8 @@ struct hsr_node; struct hsr_proto_ops { /* format and send supervision frame */ - void (*send_sv_frame)(struct hsr_port *port, unsigned long *interval); + void (*send_sv_frame)(struct hsr_port *port, unsigned long *interval, + const unsigned char addr[ETH_ALEN]); void (*handle_san_frame)(bool san, enum hsr_port_type port, struct hsr_node *node); bool (*drop_frame)(struct hsr_frame_info *frame, struct hsr_port *port); @@ -197,6 +198,7 @@ struct hsr_priv { struct list_head proxy_node_db; /* RedBox HSR proxy nodes */ struct hsr_self_node __rcu *self_node; /* MACs of slaves */ struct timer_list announce_timer; /* Supervision frame dispatch */ + struct timer_list announce_proxy_timer; struct timer_list prune_timer; struct timer_list prune_proxy_timer; int announce_count; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 898f18c6da53..f6ff0b61e08a 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -131,6 +131,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->prune_proxy_timer); del_timer_sync(&hsr->announce_timer); + timer_delete_sync(&hsr->announce_proxy_timer); hsr_debugfs_term(hsr); hsr_del_ports(hsr); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 56ef873828f4..867d637d86f0 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -130,7 +130,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, goto err; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 18227757ec0c..3f88d0961e5b 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -260,17 +260,17 @@ static int bpf_tcp_ca_check_member(const struct btf_type *t, return 0; } -static int bpf_tcp_ca_reg(void *kdata) +static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link) { return tcp_register_congestion_control(kdata); } -static void bpf_tcp_ca_unreg(void *kdata) +static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link) { tcp_unregister_congestion_control(kdata); } -static int bpf_tcp_ca_update(void *kdata, void *old_kdata) +static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link) { return tcp_update_congestion_control(kdata, old_kdata); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index e9cb27061c12..8cc0e2f4159d 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1976,7 +1976,7 @@ int cipso_v4_req_setattr(struct request_sock *req, buf = NULL; req_inet = inet_rsk(req); - opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); + opt = unrcu_pointer(xchg(&req_inet->ireq_opt, RCU_INITIALIZER(opt))); if (opt) kfree_rcu(opt, rcu); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f669da98d11d..7b6b042208bd 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1030,7 +1030,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strscpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) return false; @@ -1459,8 +1459,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) goto failure; - fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, - cfg->fc_mx_len, extack); + fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index a8494f796dca..0abbc413e0fe 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -433,7 +433,7 @@ next_proto: offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); - if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) + if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d4f0eff8b20f..64d07b842e73 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -911,6 +911,64 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock * +reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, + bool attach_listener) +{ + struct request_sock *req; + + req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!req) + return NULL; + req->rsk_listener = NULL; + if (attach_listener) { + if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) { + kmem_cache_free(ops->slab, req); + return NULL; + } + req->rsk_listener = sk_listener; + } + req->rsk_ops = ops; + req_to_sk(req)->sk_prot = sk_listener->sk_prot; + sk_node_init(&req_to_sk(req)->sk_node); + sk_tx_queue_clear(req_to_sk(req)); + req->saved_syn = NULL; + req->syncookie = 0; + req->timeout = 0; + req->num_timeout = 0; + req->num_retrans = 0; + req->sk = NULL; + refcount_set(&req->rsk_refcnt, 0); + + return req; +} +#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__)) + +struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ireq_opt = NULL; +#if IS_ENABLED(CONFIG_IPV6) + ireq->pktopts = NULL; +#endif + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + ireq->ireq_family = sk_listener->sk_family; + req->timeout = TCP_TIMEOUT_INIT; + } + + return req; +} +EXPORT_SYMBOL(inet_reqsk_alloc); + static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index faaec92a46ac..d179a2c84222 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -619,7 +619,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; - head->mono_delivery_time = q->mono_delivery_time; + head->tstamp_type = q->tstamp_type; if (sk) refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e28075f0006e..337390ba85b4 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -92,13 +92,22 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, hlist_nulls_add_head_rcu(&tw->tw_node, list); } +static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, false); +} + /* - * Enter the time wait state. This is called with locally disabled BH. + * Enter the time wait state. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. + * + * The caller must not access @tw anymore after this function returns. */ -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo) +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -114,6 +123,7 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, hashinfo->bhash_size)]; bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); + local_bh_disable(); spin_lock(&bhead->lock); spin_lock(&bhead2->lock); @@ -129,26 +139,34 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); + /* Step 2: Hash TW into tcp ehash chain */ inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock(lock); + /* Ensure above writes are committed into memory before updating the + * refcount. + * Provides ordering vs later refcount_inc(). + */ + smp_wmb(); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. * - one reference for timer. - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. * Also note that after this point, we lost our implicit reference * so we are not allowed to use tw anymore. */ refcount_set(&tw->tw_refcnt, 3); + + inet_twsk_schedule(tw, timeo); + + spin_unlock(lock); + local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -192,7 +210,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); + timer_setup(&tw->tw_timer, tw_timer_handler, 0); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this @@ -217,7 +235,34 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { - if (del_timer_sync(&tw->tw_timer)) + struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; + spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + + /* inet_twsk_purge() walks over all sockets, including tw ones, + * and removes them via inet_twsk_deschedule_put() after a + * refcount_inc_not_zero(). + * + * inet_twsk_hashdance_schedule() must (re)init the refcount before + * arming the timer, i.e. inet_twsk_purge can obtain a reference to + * a twsk that did not yet schedule the timer. + * + * The ehash lock synchronizes these two: + * After acquiring the lock, the timer is always scheduled (else + * timer_shutdown returns false), because hashdance_schedule releases + * the ehash lock only after completing the timer initialization. + * + * Without grabbing the ehash lock, we get: + * 1) cpu x sets twsk refcount to 3 + * 2) cpu y bumps refcount to 4 + * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down + * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown + * -> timer refcount is never decremented. + */ + spin_lock(lock); + /* Makes sure hashdance_schedule() has completed */ + spin_unlock(lock); + + if (timer_shutdown_sync(&tw->tw_timer)) inet_twsk_kill(tw); inet_twsk_put(tw); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 08e2c92e25ab..a92664a5ef2e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -355,7 +355,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; - qp->q.mono_delivery_time = skb->mono_delivery_time; + qp->q.tstamp_type = skb->tstamp_type; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9500031a1f55..b90d0f78ac80 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -764,7 +764,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; struct sk_buff *skb2; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; @@ -856,7 +856,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } } - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) @@ -912,7 +912,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; @@ -1457,7 +1457,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority); skb->mark = cork->mark; - skb->tstamp = cork->transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1649,7 +1652,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - nskb->mono_delivery_time = !!transmit_time; + if (transmit_time) + nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index bccef2fcf620..5cffad42fe8c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1099,7 +1099,6 @@ static void ip_tunnel_dev_free(struct net_device *dev) gro_cells_destroy(&tunnel->gro_cells); dst_cache_destroy(&tunnel->dst_cache); - free_percpu(dev->tstats); } void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) @@ -1313,20 +1312,15 @@ int ip_tunnel_init(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip_tunnel_dev_free; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (err) { - free_percpu(dev->tstats); + if (err) return err; - } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { dst_cache_destroy(&tunnel->dst_cache); - free_percpu(dev->tstats); return err; } diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 0e3ee1532848..8ddac1f595ed 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -7,7 +7,7 @@ #include <net/net_namespace.h> #include <net/tcp.h> -static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, +static int ip_metrics_convert(struct nlattr *fc_mx, int fc_mx_len, u32 *metrics, struct netlink_ext_ack *extack) { @@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, char tmp[TCP_CA_NAME_MAX]; nla_strscpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; @@ -63,7 +63,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, return 0; } -struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, +struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack) { @@ -77,7 +77,7 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); - err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics, + err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics, extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 823306487a82..619ddc087957 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -946,7 +946,7 @@ static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); pr_debug("ping_queue_rcv_skb -> failed\n"); return reason; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4cb43401e0e0..474dfd263c8b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -301,7 +301,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) ipv4_pktinfo_prepare(sk, skb, true); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } @@ -312,7 +312,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); @@ -360,7 +360,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->protocol = htons(ETH_P_IP); skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); *rtp = NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b3073d1c8f8f..54512acbead7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1481,7 +1481,6 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) struct uncached_list { spinlock_t lock; struct list_head head; - struct list_head quarantine; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); @@ -1532,7 +1531,7 @@ void rt_flush_dev(struct net_device *dev) rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); - list_move(&rt->dst.rt_uncached, &ul->quarantine); + list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } @@ -1924,7 +1923,7 @@ static u32 fib_multipath_custom_hash_outer(const struct net *net, hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, @@ -1973,7 +1972,7 @@ static u32 fib_multipath_custom_hash_inner(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, @@ -2010,7 +2009,7 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ @@ -2031,7 +2030,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ @@ -2065,7 +2064,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2096,7 +2095,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) @@ -3661,7 +3660,6 @@ int __init ip_rt_init(void) struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); - INIT_LIST_HEAD(&ul->quarantine); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b61d36810fe3..1948d15f1f28 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -496,6 +496,6 @@ out: out_free: reqsk_free(req); out_drop: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NULL; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 162a0a3b6ba5..9140d20eb2d4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -130,7 +130,8 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write, return ret; } -static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) +static void inet_get_ping_group_range_table(const struct ctl_table *table, + kgid_t *low, kgid_t *high) { kgid_t *data = table->data; struct net *net = @@ -145,7 +146,8 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low } /* Update system visible IP port range */ -static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) +static void set_ping_group_range(const struct ctl_table *table, + kgid_t low, kgid_t high) { kgid_t *data = table->data; struct net *net = @@ -462,6 +464,61 @@ static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write, return ret; } + +static u32 proc_fib_multipath_hash_rand_seed __ro_after_init; + +static void proc_fib_multipath_hash_init_rand_seed(void) +{ + get_random_bytes(&proc_fib_multipath_hash_rand_seed, + sizeof(proc_fib_multipath_hash_rand_seed)); +} + +static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) +{ + struct sysctl_fib_multipath_hash_seed new = { + .user_seed = user_seed, + .mp_seed = (user_seed ? user_seed : + proc_fib_multipath_hash_rand_seed), + }; + + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new); +} + +static int proc_fib_multipath_hash_seed(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct sysctl_fib_multipath_hash_seed *mphs; + struct net *net = table->data; + struct ctl_table tmp; + u32 user_seed; + int ret; + + mphs = &net->ipv4.sysctl_fib_multipath_hash_seed; + user_seed = mphs->user_seed; + + tmp = *table; + tmp.data = &user_seed; + + ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) { + proc_fib_multipath_hash_set_seed(net, user_seed); + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); + } + + return ret; +} +#else + +static void proc_fib_multipath_hash_init_rand_seed(void) +{ +} + +static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) +{ +} + #endif static struct ctl_table ipv4_table[] = { @@ -1070,6 +1127,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &fib_multipath_hash_fields_all_mask, }, + { + .procname = "fib_multipath_hash_seed", + .data = &init_net, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_fib_multipath_hash_seed, + }, #endif { .procname = "ip_unprivileged_port_start", @@ -1501,6 +1565,14 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "tcp_rto_min_us", + .data = &init_net.ipv4.sysctl_tcp_rto_min_us, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) @@ -1540,6 +1612,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + proc_fib_multipath_hash_set_seed(net, 0); + return 0; err_ports: @@ -1574,6 +1648,8 @@ static __init int sysctl_ipv4_init(void) if (!hdr) return -ENOMEM; + proc_fib_multipath_hash_init_rand_seed(); + if (register_pernet_subsys(&ipv4_sysctl_ops)) { unregister_net_sysctl_table(hdr); return -ENOMEM; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6790ea74877..e03a342c9162 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,6 +282,7 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> #include <net/hotdata.h> +#include <trace/events/tcp.h> #include <net/rps.h> /* Track pending CMSGs. */ @@ -420,6 +421,7 @@ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int rto_min_us; tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; @@ -428,7 +430,8 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; - icsk->icsk_rto_min = TCP_RTO_MIN; + rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); + icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -598,7 +601,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) */ mask |= EPOLLOUT | EPOLLWRNORM; } - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */ smp_rmb(); if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) @@ -3086,7 +3089,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); + dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL))); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; @@ -4461,7 +4464,7 @@ int tcp_md5_hash_key(struct tcp_sigpool *hp, EXPORT_SYMBOL(tcp_md5_hash_key); /* Called with rcu_read_lock() */ -enum skb_drop_reason +static enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) @@ -4481,7 +4484,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, if (!key && hash_location) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); - tcp_hash_fail("Unexpected MD5 Hash found", family, skb, ""); + trace_tcp_hash_md5_unexpected(sk, skb); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } @@ -4496,29 +4499,90 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - if (family == AF_INET) { - tcp_hash_fail("MD5 Hash failed", AF_INET, skb, "%s L3 index %d", - genhash ? "tcp_v4_calc_md5_hash failed" - : "", l3index); - } else { - if (genhash) { - tcp_hash_fail("MD5 Hash failed", - AF_INET6, skb, "L3 index %d", - l3index); - } else { - tcp_hash_fail("MD5 Hash mismatch", - AF_INET6, skb, "L3 index %d", - l3index); - } - } + trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; } -EXPORT_SYMBOL(tcp_inbound_md5_hash); +#else +static inline enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int l3index, const __u8 *hash_location) +{ + return SKB_NOT_DROPPED_YET; +} #endif +/* Called with rcu_read_lock() */ +enum skb_drop_reason +tcp_inbound_hash(struct sock *sk, const struct request_sock *req, + const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct tcp_ao_hdr *aoh; + const __u8 *md5_location; + int l3index; + + /* Invalid option or two times meet any of auth options */ + if (tcp_parse_auth_options(th, &md5_location, &aoh)) { + trace_tcp_hash_bad_header(sk, skb); + return SKB_DROP_REASON_TCP_AUTH_HDR; + } + + if (req) { + if (tcp_rsk_used_ao(req) != !!aoh) { + u8 keyid, rnext, maclen; + + if (aoh) { + keyid = aoh->keyid; + rnext = aoh->rnext_keyid; + maclen = tcp_ao_hdr_maclen(aoh); + } else { + keyid = rnext = maclen = 0; + } + + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); + trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen); + return SKB_DROP_REASON_TCP_AOFAILURE; + } + } + + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + /* Fast path: unsigned segments */ + if (likely(!md5_location && !aoh)) { + /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid + * for the remote peer. On TCP-AO established connection + * the last key is impossible to remove, so there's + * always at least one current_key. + */ + if (tcp_ao_required(sk, saddr, family, l3index, true)) { + trace_tcp_hash_ao_required(sk, skb); + return SKB_DROP_REASON_TCP_AONOTFOUND; + } + if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + trace_tcp_hash_md5_required(sk, skb); + return SKB_DROP_REASON_TCP_MD5NOTFOUND; + } + return SKB_NOT_DROPPED_YET; + } + + if (aoh) + return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh); + + return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, + l3index, md5_location); +} +EXPORT_SYMBOL_GPL(tcp_inbound_hash); + void tcp_done(struct sock *sk) { struct request_sock *req; @@ -4583,14 +4647,10 @@ int tcp_abort(struct sock *sk, int err) bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { - WRITE_ONCE(sk->sk_err, err); - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); - tcp_done(sk); + tcp_done_with_error(sk, err); } bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 09c0fa6756b7..85531437890c 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -16,6 +16,7 @@ #include <net/tcp.h> #include <net/ipv6.h> #include <net/icmp.h> +#include <trace/events/tcp.h> DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ); @@ -884,17 +885,16 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, const struct tcp_ao_hdr *aoh, struct tcp_ao_key *key, u8 *traffic_key, u8 *phash, u32 sne, int l3index) { - u8 maclen = aoh->length - sizeof(struct tcp_ao_hdr); const struct tcphdr *th = tcp_hdr(skb); + u8 maclen = tcp_ao_hdr_maclen(aoh); void *hash_buf = NULL; if (maclen != tcp_ao_maclen(key)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); atomic64_inc(&info->counters.pkt_bad); atomic64_inc(&key->pkt_bad); - tcp_hash_fail("AO hash wrong length", family, skb, - "%u != %d L3index: %d", maclen, - tcp_ao_maclen(key), l3index); + trace_tcp_ao_wrong_maclen(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOFAILURE; } @@ -909,8 +909,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); atomic64_inc(&info->counters.pkt_bad); atomic64_inc(&key->pkt_bad); - tcp_hash_fail("AO hash mismatch", family, skb, - "L3index: %d", l3index); + trace_tcp_ao_mismatch(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); kfree(hash_buf); return SKB_DROP_REASON_TCP_AOFAILURE; } @@ -927,6 +927,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, int l3index, const struct tcp_ao_hdr *aoh) { const struct tcphdr *th = tcp_hdr(skb); + u8 maclen = tcp_ao_hdr_maclen(aoh); u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */ struct tcp_ao_info *info; enum skb_drop_reason ret; @@ -939,8 +940,8 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, info = rcu_dereference(tcp_sk(sk)->ao_info); if (!info) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND); - tcp_hash_fail("AO key not found", family, skb, - "keyid: %u L3index: %d", aoh->keyid, l3index); + trace_tcp_ao_key_not_found(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOUNEXPECTED; } @@ -981,6 +982,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, current_key = READ_ONCE(info->current_key); /* Key rotation: the peer asks us to use new key (RNext) */ if (unlikely(aoh->rnext_keyid != current_key->sndid)) { + trace_tcp_ao_rnext_request(sk, skb, current_key->sndid, + aoh->rnext_keyid, + tcp_ao_hdr_maclen(aoh)); /* If the key is not found we do nothing. */ key = tcp_ao_established_key(info, aoh->rnext_keyid, -1); if (key) @@ -1046,8 +1050,8 @@ verify_hash: key_not_found: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND); atomic64_inc(&info->counters.key_not_found); - tcp_hash_fail("Requested by the peer AO key id not found", - family, skb, "L3index: %d", l3index); + trace_tcp_ao_key_not_found(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOKEYNOTFOUND; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 28ffcfbeef14..48617d99abb0 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -46,8 +46,7 @@ void tcp_set_ca_state(struct sock *sk, const u8 ca_state) } /* Must be called with rcu lock held */ -static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, - const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); @@ -178,7 +177,7 @@ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_cong return ret; } -u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -186,7 +185,7 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = tcp_ca_find_autoload(net, name); + ca = tcp_ca_find_autoload(name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -283,7 +282,7 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) int ret; rcu_read_lock(); - ca = tcp_ca_find_autoload(net, name); + ca = tcp_ca_find_autoload(name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { @@ -421,7 +420,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, if (!load) ca = tcp_ca_find(name); else - ca = tcp_ca_find_autoload(sock_net(sk), name); + ca = tcp_ca_find_autoload(name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8ed54e7334a9..0f523cbfe329 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -49,7 +49,7 @@ void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; - ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL); + ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL)); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); @@ -80,9 +80,10 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; - octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx); + octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx))); } else { - octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx); + octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, + RCU_INITIALIZER(ctx))); } if (octx) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2e39cb881e20..ec2ed92dcad5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3595,8 +3595,10 @@ static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); - if (ao && ack < tp->snd_una) + if (ao && ack < tp->snd_una) { ao->snd_sne++; + trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne); + } #endif } @@ -3621,8 +3623,10 @@ static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); - if (ao && seq < tp->rcv_nxt) + if (ao && seq < tp->rcv_nxt) { ao->rcv_sne++; + trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne); + } #endif } @@ -4453,9 +4457,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, return SKB_NOT_DROPPED_YET; } + +void tcp_done_with_error(struct sock *sk, int err) +{ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + WRITE_ONCE(sk->sk_err, err); + smp_wmb(); + + tcp_write_queue_purge(sk); + tcp_done(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); +} +EXPORT_SYMBOL(tcp_done_with_error); + /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) { + int err; + trace_tcp_receive_reset(sk); /* mptcp can't tell us to ignore reset pkts, @@ -4467,24 +4488,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: - WRITE_ONCE(sk->sk_err, ECONNREFUSED); + err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: - WRITE_ONCE(sk->sk_err, EPIPE); + err = EPIPE; break; case TCP_CLOSE: return; default: - WRITE_ONCE(sk->sk_err, ECONNRESET); + err = ECONNRESET; } - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - - tcp_write_queue_purge(sk); - tcp_done(sk); - - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); + tcp_done_with_error(sk, err); } /* @@ -4820,10 +4834,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; - if (!mptcp_skb_can_collapse(to, from)) - return false; - - if (skb_cmp_decrypted(from, to)) + if (!tcp_skb_can_collapse_rx(to, from)) return false; if (!skb_try_coalesce(to, from, fragstolen, &delta)) @@ -4866,7 +4877,7 @@ static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { sk_drops_add(sk, skb); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); } /* This one checks to see if we can put data from the @@ -5379,7 +5390,7 @@ restart: break; } - if (n && n != tail && mptcp_skb_can_collapse(skb, n) && + if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -5430,11 +5441,9 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || - !mptcp_skb_can_collapse(nskb, skb) || + !tcp_skb_can_collapse_rx(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; - if (skb_cmp_decrypted(skb, nskb)) - goto end; } } } @@ -6998,31 +7007,6 @@ static void tcp_openreq_init(struct request_sock *req, #endif } -struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener, - bool attach_listener) -{ - struct request_sock *req = reqsk_alloc(ops, sk_listener, - attach_listener); - - if (req) { - struct inet_request_sock *ireq = inet_rsk(req); - - ireq->ireq_opt = NULL; -#if IS_ENABLED(CONFIG_IPV6) - ireq->pktopts = NULL; -#endif - atomic64_set(&ireq->ir_cookie, 0); - ireq->ireq_state = TCP_NEW_SYN_RECV; - write_pnet(&ireq->ireq_net, sock_net(sk_listener)); - ireq->ireq_family = sk_listener->sk_family; - req->timeout = TCP_TIMEOUT_INIT; - } - - return req; -} -EXPORT_SYMBOL(inet_reqsk_alloc); - /* * Return true if a syncookie should be sent */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b710958393e6..fd17f25ff288 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -93,7 +93,9 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); +static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static u32 tcp_v4_init_seq(const struct sk_buff *skb) { @@ -114,6 +116,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); + int ts_recent_stamp; if (reuse == 2) { /* Still does not detect *everything* that goes through @@ -152,10 +155,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) If TW bucket has been already destroyed we fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tcptw->tw_ts_recent_stamp && + ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + if (ts_recent_stamp && (!twp || (reuse && time_after32(ktime_get_seconds(), - tcptw->tw_ts_recent_stamp)))) { - /* inet_twsk_hashdance() sets sk_refcnt after putting twsk + ts_recent_stamp)))) { + /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk * and releasing the bucket lock. */ if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) @@ -178,8 +182,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); + tp->rx_opt.ts_recent_stamp = ts_recent_stamp; } return 1; @@ -611,15 +615,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); - if (!sock_owned_by_user(sk)) { - WRITE_ONCE(sk->sk_err, err); - - sk_error_report(sk); - - tcp_done(sk); - } else { + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else WRITE_ONCE(sk->sk_err_soft, err); - } goto out; } @@ -885,7 +884,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); + sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? @@ -910,6 +911,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG @@ -1005,7 +1007,8 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); @@ -1020,6 +1023,7 @@ static void tcp_v4_send_ack(const struct sock *sk, sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); } @@ -1057,19 +1061,17 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) #else if (0) { #endif -#ifdef CONFIG_TCP_MD5SIG - } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + } else if (static_branch_tcp_md5()) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; -#endif } tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), - tcptw->tw_ts_recent, + READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, tw->tw_tos, @@ -1131,8 +1133,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, #else if (0) { #endif -#ifdef CONFIG_TCP_MD5SIG - } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + } else if (static_branch_tcp_md5()) { const union tcp_md5_addr *addr; int l3index; @@ -1141,7 +1142,6 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key.md5_key) key.type = TCP_KEY_MD5; -#endif } tcp_v4_send_ack(sk, skb, seq, @@ -1939,7 +1939,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, @@ -2049,8 +2049,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || - !mptcp_skb_can_collapse(tail, skb) || - skb_cmp_decrypted(tail, skb) || + !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; @@ -2176,8 +2175,8 @@ int tcp_v4_rcv(struct sk_buff *skb) int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; u32 isn; @@ -2376,7 +2375,7 @@ bad_packet: discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); /* Discard frame. */ - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: @@ -3506,6 +3505,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_shrink_window = 0; net->ipv4.sysctl_tcp_pingpong_thresh = 1; + net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); return 0; } @@ -3620,7 +3620,9 @@ void __init tcp_v4_init(void) */ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; - per_cpu(ipv4_tcp_sk, cpu) = sk; + sk->sk_clockid = CLOCK_MONOTONIC; + + per_cpu(ipv4_tcp_sk.sock, cpu) = sk; } if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 538c06f95918..bc67f6b9efae 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -101,16 +101,18 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + int ts_recent_stamp; tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { + ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; - tmp_opt.ts_recent = tcptw->tw_ts_recent; - tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); + tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -152,8 +154,10 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq); if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = ktime_get_seconds(); - tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + WRITE_ONCE(tcptw->tw_ts_recent_stamp, + ktime_get_seconds()); + WRITE_ONCE(tcptw->tw_ts_recent, + tmp_opt.rcv_tsval); } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); @@ -197,8 +201,10 @@ kill: } if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = ktime_get_seconds(); + WRITE_ONCE(tcptw->tw_ts_recent, + tmp_opt.rcv_tsval); + WRITE_ONCE(tcptw->tw_ts_recent_stamp, + ktime_get_seconds()); } inet_twsk_put(tw); @@ -225,7 +231,7 @@ kill: if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || (tmp_opt.saw_tstamp && - (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; @@ -339,17 +345,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; - /* tw_timer is pinned, so we need to make sure BH are disabled - * in following section, otherwise timer handler could run before - * we complete the initialization. - */ - local_bh_disable(); - inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo); - local_bh_enable(); + inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95618d0e78e4..16c48df8df4c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1301,7 +1301,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; @@ -1655,7 +1655,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - skb_set_delivery_time(buff, skb->tstamp, true); + skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -2764,7 +2764,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ @@ -3752,11 +3752,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), - true); + SKB_CLOCK_MONOTONIC); else #endif { - skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3768,6 +3768,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; + u8 rnext = tcp_rsk(req)->ao_rcv_next; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); @@ -3777,6 +3778,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. */ if (unlikely(!ao_key)) { + trace_tcp_ao_synack_no_key(sk, keyid, rnext); rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", @@ -3843,7 +3845,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); - skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; @@ -4027,7 +4029,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); + skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) @@ -4163,16 +4165,9 @@ EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { - const struct dst_entry *dst = __sk_dst_get(sk); - u32 delack_max = inet_csk(sk)->icsk_delack_max; - - if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) { - u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); - u32 delack_from_rto_min = max_t(int, 1, rto_min - 1); + u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; - delack_max = min_t(u32, delack_max, delack_from_rto_min); - } - return delack_max; + return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c index 8512cb09ebc0..d8a4f192873a 100644 --- a/net/ipv4/tcp_sigpool.c +++ b/net/ipv4/tcp_sigpool.c @@ -10,7 +10,14 @@ #include <net/tcp.h> static size_t __scratch_size; -static DEFINE_PER_CPU(void __rcu *, sigpool_scratch); +struct sigpool_scratch { + local_lock_t bh_lock; + void __rcu *pad; +}; + +static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; struct sigpool_entry { struct crypto_ahash *hash; @@ -72,7 +79,7 @@ static int sigpool_reserve_scratch(size_t size) break; } - old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), + old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), scratch, lockdep_is_held(&cpool_mutex)); if (!cpu_online(cpu) || !old_scratch) { kfree(old_scratch); @@ -93,7 +100,7 @@ static void sigpool_scratch_free(void) int cpu; for_each_possible_cpu(cpu) - kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), + kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), NULL, lockdep_is_held(&cpool_mutex))); __scratch_size = 0; } @@ -277,7 +284,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RC /* Pairs with tcp_sigpool_reserve_scratch(), scratch area is * valid (allocated) until tcp_sigpool_end(). */ - c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch)); + local_lock_nested_bh(&sigpool_scratch.bh_lock); + c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad)); return 0; } EXPORT_SYMBOL_GPL(tcp_sigpool_start); @@ -286,6 +294,7 @@ void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH) { struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req); + local_unlock_nested_bh(&sigpool_scratch.bh_lock); rcu_read_unlock_bh(); ahash_request_free(c->req); crypto_free_ahash(hash); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5bfd76a31af6..ab4b6de0e069 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -74,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) static void tcp_write_err(struct sock *sk) { - WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); - sk_error_report(sk); - - tcp_write_queue_purge(sk); - tcp_done(sk); + tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 189c9113fe9a..d08bf16d476d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2074,7 +2074,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); trace_udp_fail_queue_rcv_skb(rc, sk, skb); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2196,7 +2196,7 @@ csum_error: drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2230,7 +2230,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old; if (dst_hold_safe(dst)) { - old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst); + old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst))); dst_release(old); return old != dst; } @@ -2383,7 +2383,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk; + struct sock *sk = NULL; struct udphdr *uh; unsigned short ulen; struct rtable *rt = skb_rtable(skb); @@ -2460,7 +2460,7 @@ no_sk: * Hmm. We got an UDP packet to a port to which we * don't wanna listen. Ignore it. */ - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; short_packet: @@ -2485,7 +2485,7 @@ csum_error: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5c424a0e7232..1e69756d53d9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -863,7 +863,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) } } -static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) +static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf) { struct net *net; int old; @@ -931,7 +931,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) } } -static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf) { struct net *net; int old; @@ -6378,7 +6378,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf) } } -static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) +static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf) { struct net *net = (struct net *)table->extra2; int old; @@ -6669,7 +6669,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) } static -int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) +int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) { struct net *net = (struct net *)ctl->extra2; struct inet6_dev *idev; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8041dc181bd4..e03fb9a1dbeb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -509,7 +509,7 @@ void inet6_cleanup_sock(struct sock *sk) /* Free tx options */ - opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + opt = unrcu_pointer(xchg(&np->opt, NULL)); if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 83e4f9855ae1..eb111d20615c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -987,7 +987,7 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; - from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 27d8725445e3..e7a19df3125e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -859,7 +859,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; @@ -955,7 +955,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -1016,7 +1016,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(frag, tstamp, mono_delivery_time); + skb_set_delivery_time(frag, tstamp, tstamp_type); err = output(net, sk, frag); if (err) goto fail; @@ -1924,7 +1924,10 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, skb->priority = READ_ONCE(sk->sk_priority); skb->mark = cork->base.mark; - skb->tstamp = cork->base.transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); ip6_cork_steal_dst(skb, cork); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d4c28ec1bc51..cd342d5015c6 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -111,8 +111,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, - opt); + opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt))); sk_dst_reset(sk); return opt; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d914b23256ce..254b192c5705 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1936,7 +1936,7 @@ static struct notifier_block ndisc_netdev_notifier = { }; #ifdef CONFIG_SYSCTL -static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, +static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 5d989d803009..581ce055bf52 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -127,7 +127,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; @@ -193,7 +193,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -226,7 +226,7 @@ slow_path: goto blackhole; } - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 5e1b50c6a44d..6f0844c9315d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -263,7 +263,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 2eedf255600b..608fa9d05b55 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -362,14 +362,14 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } @@ -390,7 +390,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); @@ -415,7 +415,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } @@ -621,7 +621,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_put(skb, length); skb_reset_network_header(skb); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 327caca64257..a48be617a8ab 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -198,7 +198,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8d72ca0b086d..5b107f241cdb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -131,7 +131,6 @@ static struct fib6_info *rt6_get_route_info(struct net *net, struct uncached_list { spinlock_t lock; struct list_head head; - struct list_head quarantine; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); @@ -189,8 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) handled = true; } if (handled) - list_move(&rt->dst.rt_uncached, - &ul->quarantine); + list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } @@ -368,7 +366,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - from = xchg((__force struct fib6_info **)&rt->from, NULL); + from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } @@ -1440,7 +1438,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, if (res->f6i->fib6_destroying) { struct fib6_info *from; - from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } @@ -1469,7 +1467,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ - from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); + from = unrcu_pointer(xchg(&rt6_ex->rt6i->from, NULL)); fib6_info_release(from); dst_dev_put(&rt6_ex->rt6i->dst); @@ -2376,7 +2374,7 @@ static u32 rt6_multipath_custom_hash_outer(const struct net *net, hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_inner(const struct net *net, @@ -2425,7 +2423,7 @@ static u32 rt6_multipath_custom_hash_inner(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_skb(const struct net *net, @@ -2464,7 +2462,7 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl6 can be NULL */ @@ -2486,7 +2484,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: if (skb) { @@ -2518,7 +2516,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2555,7 +2553,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) @@ -3764,7 +3762,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!rt) goto out; - rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len, + rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); @@ -6758,7 +6756,6 @@ int __init ip6_route_init(void) struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); - INIT_LIST_HEAD(&ul->quarantine); spin_lock_init(&ul->lock); } diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index a31521e270f7..180da19c148c 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -21,9 +21,7 @@ #include <net/genetlink.h> #include <linux/seg6.h> #include <linux/seg6_genl.h> -#ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> -#endif bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) { @@ -437,13 +435,11 @@ static int __net_init seg6_net_init(struct net *net) net->ipv6.seg6_data = sdata; -#ifdef CONFIG_IPV6_SEG6_HMAC if (seg6_hmac_net_init(net)) { kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; } -#endif return 0; } @@ -452,9 +448,7 @@ static void __net_exit seg6_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); -#ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_net_exit(net); -#endif kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); @@ -520,41 +514,28 @@ int __init seg6_init(void) if (err) goto out_unregister_pernet; -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL err = seg6_iptunnel_init(); if (err) goto out_unregister_genl; err = seg6_local_init(); - if (err) { - seg6_iptunnel_exit(); - goto out_unregister_genl; - } -#endif + if (err) + goto out_unregister_iptun; -#ifdef CONFIG_IPV6_SEG6_HMAC err = seg6_hmac_init(); if (err) - goto out_unregister_iptun; -#endif + goto out_unregister_seg6; pr_info("Segment Routing with IPv6\n"); out: return err; -#ifdef CONFIG_IPV6_SEG6_HMAC -out_unregister_iptun: -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL +out_unregister_seg6: seg6_local_exit(); +out_unregister_iptun: seg6_iptunnel_exit(); -#endif -#endif -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL out_unregister_genl: -#endif -#if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL) || IS_ENABLED(CONFIG_IPV6_SEG6_HMAC) genl_unregister_family(&seg6_genl_family); -#endif out_unregister_pernet: unregister_pernet_subsys(&ip6_segments_ops); goto out; @@ -562,13 +543,9 @@ out_unregister_pernet: void seg6_exit(void) { -#ifdef CONFIG_IPV6_SEG6_HMAC seg6_hmac_exit(); -#endif -#ifdef CONFIG_IPV6_SEG6_LWTUNNEL seg6_local_exit(); seg6_iptunnel_exit(); -#endif genl_unregister_family(&seg6_genl_family); unregister_pernet_subsys(&ip6_segments_ops); } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index c434940131b1..c74705ead984 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1380,7 +1380,9 @@ drop: return err; } -DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; bool seg6_bpf_has_valid_srh(struct sk_buff *skb) { @@ -1388,6 +1390,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return false; @@ -1408,8 +1411,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct seg6_bpf_srh_state *srh_state = - this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state *srh_state; struct ipv6_sr_hdr *srh; int ret; @@ -1420,10 +1422,14 @@ static int input_action_end_bpf(struct sk_buff *skb, } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - /* preempt_disable is needed to protect the per-CPU buffer srh_state, - * which is also accessed by the bpf_lwt_seg6_* helpers + /* The access to the per-CPU buffer srh_state is protected by running + * always in softirq context (with disabled BH). On PREEMPT_RT the + * required locking is provided by the following local_lock_nested_bh() + * statement. It is also accessed by the bpf_lwt_seg6_* helpers via + * bpf_prog_run_save_cb(). */ - preempt_disable(); + local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock); + srh_state = this_cpu_ptr(&seg6_bpf_srh_states); srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; srh_state->valid = true; @@ -1446,15 +1452,15 @@ static int input_action_end_bpf(struct sk_buff *skb, if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); - preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: - preempt_enable(); + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); kfree_skb(skb); return -EINVAL; } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bfad1e89b6a6..9d83eadd308b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -275,6 +275,6 @@ out: out_free: reqsk_free(req); out_drop: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NULL; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 729faf8bd366..200fea92f12f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -490,14 +490,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); - if (!sock_owned_by_user(sk)) { - WRITE_ONCE(sk->sk_err, err); - sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - - tcp_done(sk); - } else { + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else WRITE_ONCE(sk->sk_err_soft, err); - } goto out; case TCP_LISTEN: break; @@ -975,7 +971,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); - skb_set_delivery_time(buff, tcp_transmit_time(sk), true); + skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ @@ -1200,9 +1196,9 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), - tcptw->tw_ts_recent, tw->tw_bound_dev_if, &key, - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, - tw->tw_txhash); + READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, + &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), + tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: @@ -1678,7 +1674,7 @@ reset: discard: if (opt_skb) __kfree_skb(opt_skb); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; @@ -1751,8 +1747,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; u32 isn; struct net *net = dev_net(skb->dev); @@ -1944,7 +1940,7 @@ bad_packet: discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: @@ -2383,8 +2379,14 @@ static struct inet_protosw tcpv6_protosw = { static int __net_init tcpv6_net_init(struct net *net) { - return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, - SOCK_RAW, IPPROTO_TCP, net); + int res; + + res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, + SOCK_RAW, IPPROTO_TCP, net); + if (!res) + net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; + + return res; } static void __net_exit tcpv6_net_exit(struct net *net) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c81a07ac0463..b56f0b9f4307 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -673,7 +673,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); trace_udp_fail_queue_rcv_skb(rc, sk, skb); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -776,7 +776,7 @@ csum_error: drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -940,8 +940,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); + struct sock *sk = NULL; struct udphdr *uh; - struct sock *sk; bool refcounted; u32 ulen = 0; @@ -1033,7 +1033,7 @@ no_sk: __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; short_packet: @@ -1054,7 +1054,7 @@ csum_error: __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return 0; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 88a34db265d8..64f446f0930b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -39,7 +39,6 @@ #include <linux/ip.h> #include <linux/udp.h> #include <linux/l2tp.h> -#include <linux/hash.h> #include <linux/sort.h> #include <linux/file.h> #include <linux/nsproxy.h> @@ -107,11 +106,23 @@ struct l2tp_net { /* Lock for write access to l2tp_tunnel_idr */ spinlock_t l2tp_tunnel_idr_lock; struct idr l2tp_tunnel_idr; - struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2]; - /* Lock for write access to l2tp_session_hlist */ - spinlock_t l2tp_session_hlist_lock; + /* Lock for write access to l2tp_v[23]_session_idr/htable */ + spinlock_t l2tp_session_idr_lock; + struct idr l2tp_v2_session_idr; + struct idr l2tp_v3_session_idr; + struct hlist_head l2tp_v3_session_htable[16]; }; +static inline u32 l2tp_v2_session_key(u16 tunnel_id, u16 session_id) +{ + return ((u32)tunnel_id) << 16 | session_id; +} + +static inline unsigned long l2tp_v3_session_hashkey(struct sock *sk, u32 session_id) +{ + return ((unsigned long)sk) + session_id; +} + #if IS_ENABLED(CONFIG_IPV6) static bool l2tp_sk_is_v6(struct sock *sk) { @@ -125,29 +136,6 @@ static inline struct l2tp_net *l2tp_pernet(const struct net *net) return net_generic(net, l2tp_net_id); } -/* Session hash global list for L2TPv3. - * The session_id SHOULD be random according to RFC3931, but several - * L2TP implementations use incrementing session_ids. So we do a real - * hash on the session_id, rather than a simple bitmask. - */ -static inline struct hlist_head * -l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id) -{ - return &pn->l2tp_session_hlist[hash_32(session_id, L2TP_HASH_BITS_2)]; -} - -/* Session hash list. - * The session_id SHOULD be random according to RFC2661, but several - * L2TP implementations (Cisco and Microsoft) use incrementing - * session_ids. So we do a real hash on the session_id, rather than a - * simple bitmask. - */ -static inline struct hlist_head * -l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) -{ - return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; -} - static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { trace_free_tunnel(tunnel); @@ -240,66 +228,82 @@ struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth) } EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth); -struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, - u32 session_id) +struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id) { - struct hlist_head *session_list; + const struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_session *session; - session_list = l2tp_session_id_hash(tunnel, session_id); - rcu_read_lock_bh(); - hlist_for_each_entry_rcu(session, session_list, hlist) - if (session->session_id == session_id) { - l2tp_session_inc_refcount(session); - rcu_read_unlock_bh(); + session = idr_find(&pn->l2tp_v3_session_idr, session_id); + if (session && !hash_hashed(&session->hlist) && + refcount_inc_not_zero(&session->ref_count)) { + rcu_read_unlock_bh(); + return session; + } - return session; + /* If we get here and session is non-NULL, the session_id + * collides with one in another tunnel. If sk is non-NULL, + * find the session matching sk. + */ + if (session && sk) { + unsigned long key = l2tp_v3_session_hashkey(sk, session->session_id); + + hash_for_each_possible_rcu(pn->l2tp_v3_session_htable, session, + hlist, key) { + if (session->tunnel->sock == sk && + refcount_inc_not_zero(&session->ref_count)) { + rcu_read_unlock_bh(); + return session; + } } + } rcu_read_unlock_bh(); return NULL; } -EXPORT_SYMBOL_GPL(l2tp_tunnel_get_session); +EXPORT_SYMBOL_GPL(l2tp_v3_session_get); -struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id) +struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id) { - struct hlist_head *session_list; + u32 session_key = l2tp_v2_session_key(tunnel_id, session_id); + const struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_session *session; - session_list = l2tp_session_id_hash_2(l2tp_pernet(net), session_id); - rcu_read_lock_bh(); - hlist_for_each_entry_rcu(session, session_list, global_hlist) - if (session->session_id == session_id) { - l2tp_session_inc_refcount(session); - rcu_read_unlock_bh(); - - return session; - } + session = idr_find(&pn->l2tp_v2_session_idr, session_key); + if (session && refcount_inc_not_zero(&session->ref_count)) { + rcu_read_unlock_bh(); + return session; + } rcu_read_unlock_bh(); return NULL; } +EXPORT_SYMBOL_GPL(l2tp_v2_session_get); + +struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver, + u32 tunnel_id, u32 session_id) +{ + if (pver == L2TP_HDR_VER_2) + return l2tp_v2_session_get(net, tunnel_id, session_id); + else + return l2tp_v3_session_get(net, sk, session_id); +} EXPORT_SYMBOL_GPL(l2tp_session_get); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth) { - int hash; struct l2tp_session *session; int count = 0; rcu_read_lock_bh(); - for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { - hlist_for_each_entry_rcu(session, &tunnel->session_hlist[hash], hlist) { - if (++count > nth) { - l2tp_session_inc_refcount(session); - rcu_read_unlock_bh(); - return session; - } + list_for_each_entry_rcu(session, &tunnel->session_list, list) { + if (++count > nth) { + l2tp_session_inc_refcount(session); + rcu_read_unlock_bh(); + return session; } } - rcu_read_unlock_bh(); return NULL; @@ -313,86 +317,186 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname) { struct l2tp_net *pn = l2tp_pernet(net); - int hash; + unsigned long tunnel_id, tmp; struct l2tp_session *session; + struct l2tp_tunnel *tunnel; rcu_read_lock_bh(); - for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) { - hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) { - if (!strcmp(session->ifname, ifname)) { - l2tp_session_inc_refcount(session); - rcu_read_unlock_bh(); - - return session; + idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { + if (tunnel) { + list_for_each_entry_rcu(session, &tunnel->session_list, list) { + if (!strcmp(session->ifname, ifname)) { + l2tp_session_inc_refcount(session); + rcu_read_unlock_bh(); + + return session; + } } } } - rcu_read_unlock_bh(); return NULL; } EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname); +static void l2tp_session_coll_list_add(struct l2tp_session_coll_list *clist, + struct l2tp_session *session) +{ + l2tp_session_inc_refcount(session); + WARN_ON_ONCE(session->coll_list); + session->coll_list = clist; + spin_lock(&clist->lock); + list_add(&session->clist, &clist->list); + spin_unlock(&clist->lock); +} + +static int l2tp_session_collision_add(struct l2tp_net *pn, + struct l2tp_session *session1, + struct l2tp_session *session2) +{ + struct l2tp_session_coll_list *clist; + + lockdep_assert_held(&pn->l2tp_session_idr_lock); + + if (!session2) + return -EEXIST; + + /* If existing session is in IP-encap tunnel, refuse new session */ + if (session2->tunnel->encap == L2TP_ENCAPTYPE_IP) + return -EEXIST; + + clist = session2->coll_list; + if (!clist) { + /* First collision. Allocate list to manage the collided sessions + * and add the existing session to the list. + */ + clist = kmalloc(sizeof(*clist), GFP_ATOMIC); + if (!clist) + return -ENOMEM; + + spin_lock_init(&clist->lock); + INIT_LIST_HEAD(&clist->list); + refcount_set(&clist->ref_count, 1); + l2tp_session_coll_list_add(clist, session2); + } + + /* If existing session isn't already in the session hlist, add it. */ + if (!hash_hashed(&session2->hlist)) + hash_add(pn->l2tp_v3_session_htable, &session2->hlist, + session2->hlist_key); + + /* Add new session to the hlist and collision list */ + hash_add(pn->l2tp_v3_session_htable, &session1->hlist, + session1->hlist_key); + refcount_inc(&clist->ref_count); + l2tp_session_coll_list_add(clist, session1); + + return 0; +} + +static void l2tp_session_collision_del(struct l2tp_net *pn, + struct l2tp_session *session) +{ + struct l2tp_session_coll_list *clist = session->coll_list; + unsigned long session_key = session->session_id; + struct l2tp_session *session2; + + lockdep_assert_held(&pn->l2tp_session_idr_lock); + + hash_del(&session->hlist); + + if (clist) { + /* Remove session from its collision list. If there + * are other sessions with the same ID, replace this + * session's IDR entry with that session, otherwise + * remove the IDR entry. If this is the last session, + * the collision list data is freed. + */ + spin_lock(&clist->lock); + list_del_init(&session->clist); + session2 = list_first_entry_or_null(&clist->list, struct l2tp_session, clist); + if (session2) { + void *old = idr_replace(&pn->l2tp_v3_session_idr, session2, session_key); + + WARN_ON_ONCE(IS_ERR_VALUE(old)); + } else { + void *removed = idr_remove(&pn->l2tp_v3_session_idr, session_key); + + WARN_ON_ONCE(removed != session); + } + session->coll_list = NULL; + spin_unlock(&clist->lock); + if (refcount_dec_and_test(&clist->ref_count)) + kfree(clist); + l2tp_session_dec_refcount(session); + } +} + int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel) { - struct l2tp_session *session_walk; - struct hlist_head *g_head; - struct hlist_head *head; - struct l2tp_net *pn; + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + u32 session_key; int err; - head = l2tp_session_id_hash(tunnel, session->session_id); - - spin_lock_bh(&tunnel->hlist_lock); + spin_lock_bh(&tunnel->list_lock); if (!tunnel->acpt_newsess) { err = -ENODEV; goto err_tlock; } - hlist_for_each_entry(session_walk, head, hlist) - if (session_walk->session_id == session->session_id) { - err = -EEXIST; - goto err_tlock; - } - if (tunnel->version == L2TP_HDR_VER_3) { - pn = l2tp_pernet(tunnel->l2tp_net); - g_head = l2tp_session_id_hash_2(pn, session->session_id); - - spin_lock_bh(&pn->l2tp_session_hlist_lock); - + session_key = session->session_id; + spin_lock_bh(&pn->l2tp_session_idr_lock); + err = idr_alloc_u32(&pn->l2tp_v3_session_idr, NULL, + &session_key, session_key, GFP_ATOMIC); /* IP encap expects session IDs to be globally unique, while - * UDP encap doesn't. + * UDP encap doesn't. This isn't per the RFC, which says that + * sessions are identified only by the session ID, but is to + * support existing userspace which depends on it. */ - hlist_for_each_entry(session_walk, g_head, global_hlist) - if (session_walk->session_id == session->session_id && - (session_walk->tunnel->encap == L2TP_ENCAPTYPE_IP || - tunnel->encap == L2TP_ENCAPTYPE_IP)) { - err = -EEXIST; - goto err_tlock_pnlock; - } + if (err == -ENOSPC && tunnel->encap == L2TP_ENCAPTYPE_UDP) { + struct l2tp_session *session2; - l2tp_tunnel_inc_refcount(tunnel); - hlist_add_head_rcu(&session->global_hlist, g_head); - - spin_unlock_bh(&pn->l2tp_session_hlist_lock); + session2 = idr_find(&pn->l2tp_v3_session_idr, + session_key); + err = l2tp_session_collision_add(pn, session, session2); + } + spin_unlock_bh(&pn->l2tp_session_idr_lock); } else { - l2tp_tunnel_inc_refcount(tunnel); + session_key = l2tp_v2_session_key(tunnel->tunnel_id, + session->session_id); + spin_lock_bh(&pn->l2tp_session_idr_lock); + err = idr_alloc_u32(&pn->l2tp_v2_session_idr, NULL, + &session_key, session_key, GFP_ATOMIC); + spin_unlock_bh(&pn->l2tp_session_idr_lock); } - hlist_add_head_rcu(&session->hlist, head); - spin_unlock_bh(&tunnel->hlist_lock); + if (err) { + if (err == -ENOSPC) + err = -EEXIST; + goto err_tlock; + } + + l2tp_tunnel_inc_refcount(tunnel); + + list_add(&session->list, &tunnel->session_list); + spin_unlock_bh(&tunnel->list_lock); + + spin_lock_bh(&pn->l2tp_session_idr_lock); + if (tunnel->version == L2TP_HDR_VER_3) + idr_replace(&pn->l2tp_v3_session_idr, session, session_key); + else + idr_replace(&pn->l2tp_v2_session_idr, session, session_key); + spin_unlock_bh(&pn->l2tp_session_idr_lock); trace_register_session(session); return 0; -err_tlock_pnlock: - spin_unlock_bh(&pn->l2tp_session_hlist_lock); err_tlock: - spin_unlock_bh(&tunnel->hlist_lock); + spin_unlock_bh(&tunnel->list_lock); return err; } @@ -785,19 +889,14 @@ static void l2tp_session_queue_purge(struct l2tp_session *session) } } -/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame - * here. The skb is not on a list when we get here. - * Returns 0 if the packet was a data packet and was successfully passed on. - * Returns 1 if the packet was not a good data packet and could not be - * forwarded. All such packets are passed up to userspace to deal with. - */ -static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) +/* UDP encapsulation receive handler. See net/ipv4/udp.c for details. */ +int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_session *session = NULL; - struct l2tp_tunnel *orig_tunnel = tunnel; + struct l2tp_tunnel *tunnel = NULL; + struct net *net = sock_net(sk); unsigned char *ptr, *optr; u16 hdrflags; - u32 tunnel_id, session_id; u16 version; int length; @@ -807,11 +906,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) __skb_pull(skb, sizeof(struct udphdr)); /* Short packet? */ - if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) { - pr_debug_ratelimited("%s: recv short packet (len=%d)\n", - tunnel->name, skb->len); - goto invalid; - } + if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) + goto pass; /* Point to L2TP header */ optr = skb->data; @@ -834,6 +930,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) ptr += 2; if (version == L2TP_HDR_VER_2) { + u16 tunnel_id, session_id; + /* If length is present, skip it */ if (hdrflags & L2TP_HDRFLAG_L) ptr += 2; @@ -841,49 +939,35 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Extract tunnel and session ID */ tunnel_id = ntohs(*(__be16 *)ptr); ptr += 2; - - if (tunnel_id != tunnel->tunnel_id) { - /* We are receiving trafic for another tunnel, probably - * because we have several tunnels between the same - * IP/port quadruple, look it up. - */ - struct l2tp_tunnel *alt_tunnel; - - alt_tunnel = l2tp_tunnel_get(tunnel->l2tp_net, tunnel_id); - if (!alt_tunnel) - goto pass; - tunnel = alt_tunnel; - } - session_id = ntohs(*(__be16 *)ptr); ptr += 2; + + session = l2tp_v2_session_get(net, tunnel_id, session_id); } else { + u32 session_id; + ptr += 2; /* skip reserved bits */ - tunnel_id = tunnel->tunnel_id; session_id = ntohl(*(__be32 *)ptr); ptr += 4; - } - /* Check protocol version */ - if (version != tunnel->version) { - pr_debug_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n", - tunnel->name, version, tunnel->version); - goto invalid; + session = l2tp_v3_session_get(net, sk, session_id); } - /* Find the session context */ - session = l2tp_tunnel_get_session(tunnel, session_id); if (!session || !session->recv_skb) { if (session) l2tp_session_dec_refcount(session); /* Not found? Pass to userspace to deal with */ - pr_debug_ratelimited("%s: no session found (%u/%u). Passing up.\n", - tunnel->name, tunnel_id, session_id); goto pass; } - if (tunnel->version == L2TP_HDR_VER_3 && + tunnel = session->tunnel; + + /* Check protocol version */ + if (version != tunnel->version) + goto invalid; + + if (version == L2TP_HDR_VER_3 && l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { l2tp_session_dec_refcount(session); goto invalid; @@ -892,9 +976,6 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); - if (tunnel != orig_tunnel) - l2tp_tunnel_dec_refcount(tunnel); - return 0; invalid: @@ -904,51 +985,14 @@ pass: /* Put UDP header back */ __skb_push(skb, sizeof(struct udphdr)); - if (tunnel != orig_tunnel) - l2tp_tunnel_dec_refcount(tunnel); - - return 1; -} - -/* UDP encapsulation receive and error receive handlers. - * See net/ipv4/udp.c for details. - * - * Note that these functions are called from inside an - * RCU-protected region, but without the socket being locked. - * - * Hence we use rcu_dereference_sk_user_data to access the - * tunnel data structure rather the usual l2tp_sk_to_tunnel - * accessor function. - */ -int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct l2tp_tunnel *tunnel; - - tunnel = rcu_dereference_sk_user_data(sk); - if (!tunnel) - goto pass_up; - if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) - goto pass_up; - - if (l2tp_udp_recv_core(tunnel, skb)) - goto pass_up; - - return 0; - -pass_up: return 1; } EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv); +/* UDP encapsulation receive error handler. See net/ipv4/udp.c for details. */ static void l2tp_udp_encap_err_recv(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { - struct l2tp_tunnel *tunnel; - - tunnel = rcu_dereference_sk_user_data(sk); - if (!tunnel || tunnel->fd < 0) - return; - sk->sk_err = err; sk_error_report(sk); @@ -1206,26 +1250,36 @@ end: return; } -/* Remove an l2tp session from l2tp_core's hash lists. */ +/* Remove an l2tp session from l2tp_core's lists. */ static void l2tp_session_unhash(struct l2tp_session *session) { struct l2tp_tunnel *tunnel = session->tunnel; - /* Remove the session from core hashes */ if (tunnel) { - /* Remove from the per-tunnel hash */ - spin_lock_bh(&tunnel->hlist_lock); - hlist_del_init_rcu(&session->hlist); - spin_unlock_bh(&tunnel->hlist_lock); - - /* For L2TPv3 we have a per-net hash: remove from there, too */ - if (tunnel->version != L2TP_HDR_VER_2) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - - spin_lock_bh(&pn->l2tp_session_hlist_lock); - hlist_del_init_rcu(&session->global_hlist); - spin_unlock_bh(&pn->l2tp_session_hlist_lock); + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + struct l2tp_session *removed = session; + + /* Remove from the per-tunnel list */ + spin_lock_bh(&tunnel->list_lock); + list_del_init(&session->list); + spin_unlock_bh(&tunnel->list_lock); + + /* Remove from per-net IDR */ + spin_lock_bh(&pn->l2tp_session_idr_lock); + if (tunnel->version == L2TP_HDR_VER_3) { + if (hash_hashed(&session->hlist)) + l2tp_session_collision_del(pn, session); + else + removed = idr_remove(&pn->l2tp_v3_session_idr, + session->session_id); + } else { + u32 session_key = l2tp_v2_session_key(tunnel->tunnel_id, + session->session_id); + removed = idr_remove(&pn->l2tp_v2_session_idr, + session_key); } + WARN_ON_ONCE(removed && removed != session); + spin_unlock_bh(&pn->l2tp_session_idr_lock); synchronize_rcu(); } @@ -1236,28 +1290,19 @@ static void l2tp_session_unhash(struct l2tp_session *session) static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { struct l2tp_session *session; - int hash; + struct list_head *pos; + struct list_head *tmp; - spin_lock_bh(&tunnel->hlist_lock); + spin_lock_bh(&tunnel->list_lock); tunnel->acpt_newsess = false; - for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { -again: - hlist_for_each_entry_rcu(session, &tunnel->session_hlist[hash], hlist) { - hlist_del_init_rcu(&session->hlist); - - spin_unlock_bh(&tunnel->hlist_lock); - l2tp_session_delete(session); - spin_lock_bh(&tunnel->hlist_lock); - - /* Now restart from the beginning of this hash - * chain. We always remove a session from the - * list so we are guaranteed to make forward - * progress. - */ - goto again; - } + list_for_each_safe(pos, tmp, &tunnel->session_list) { + session = list_entry(pos, struct l2tp_session, list); + list_del_init(&session->list); + spin_unlock_bh(&tunnel->list_lock); + l2tp_session_delete(session); + spin_lock_bh(&tunnel->list_lock); } - spin_unlock_bh(&tunnel->hlist_lock); + spin_unlock_bh(&tunnel->list_lock); } /* Tunnel socket destroy hook for UDP encapsulation */ @@ -1451,8 +1496,9 @@ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, tunnel->magic = L2TP_TUNNEL_MAGIC; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); - spin_lock_init(&tunnel->hlist_lock); + spin_lock_init(&tunnel->list_lock); tunnel->acpt_newsess = true; + INIT_LIST_HEAD(&tunnel->session_list); tunnel->encap = encap; @@ -1462,8 +1508,6 @@ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, /* Init delete workqueue struct */ INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work); - INIT_LIST_HEAD(&tunnel->list); - err = 0; err: if (tunnelp) @@ -1651,8 +1695,10 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn skb_queue_head_init(&session->reorder_q); + session->hlist_key = l2tp_v3_session_hashkey(tunnel->sock, session->session_id); INIT_HLIST_NODE(&session->hlist); - INIT_HLIST_NODE(&session->global_hlist); + INIT_LIST_HEAD(&session->clist); + INIT_LIST_HEAD(&session->list); if (cfg) { session->pwtype = cfg->pw_type; @@ -1685,15 +1731,13 @@ EXPORT_SYMBOL_GPL(l2tp_session_create); static __net_init int l2tp_init_net(struct net *net) { struct l2tp_net *pn = net_generic(net, l2tp_net_id); - int hash; idr_init(&pn->l2tp_tunnel_idr); spin_lock_init(&pn->l2tp_tunnel_idr_lock); - for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) - INIT_HLIST_HEAD(&pn->l2tp_session_hlist[hash]); - - spin_lock_init(&pn->l2tp_session_hlist_lock); + idr_init(&pn->l2tp_v2_session_idr); + idr_init(&pn->l2tp_v3_session_idr); + spin_lock_init(&pn->l2tp_session_idr_lock); return 0; } @@ -1703,7 +1747,6 @@ static __net_exit void l2tp_exit_net(struct net *net) struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; unsigned long tunnel_id, tmp; - int hash; rcu_read_lock_bh(); idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { @@ -1716,8 +1759,8 @@ static __net_exit void l2tp_exit_net(struct net *net) flush_workqueue(l2tp_wq); rcu_barrier(); - for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) - WARN_ON_ONCE(!hlist_empty(&pn->l2tp_session_hlist[hash])); + idr_destroy(&pn->l2tp_v2_session_idr); + idr_destroy(&pn->l2tp_v3_session_idr); idr_destroy(&pn->l2tp_tunnel_idr); } diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 91ebf0a3f499..8ac81bc1bc6f 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -19,14 +19,6 @@ #define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D -/* Per tunnel session hash table size */ -#define L2TP_HASH_BITS 4 -#define L2TP_HASH_SIZE BIT(L2TP_HASH_BITS) - -/* System-wide session hash table size */ -#define L2TP_HASH_BITS_2 8 -#define L2TP_HASH_SIZE_2 BIT(L2TP_HASH_BITS_2) - struct sk_buff; struct l2tp_stats { @@ -61,10 +53,15 @@ struct l2tp_session_cfg { char *ifname; }; +struct l2tp_session_coll_list { + spinlock_t lock; /* for access to list */ + struct list_head list; + refcount_t ref_count; +}; + /* Represents a session (pseudowire) instance. * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics. - * Is linked into a per-tunnel session hashlist; and in the case of an L2TPv3 session into - * an additional per-net ("global") hashlist. + * Is linked into a per-tunnel session list and a per-net ("global") IDR tree. */ #define L2TP_SESSION_NAME_MAX 32 struct l2tp_session { @@ -88,8 +85,12 @@ struct l2tp_session { u32 nr_oos; /* NR of last OOS packet */ int nr_oos_count; /* for OOS recovery */ int nr_oos_count_max; - struct hlist_node hlist; /* hash list node */ + struct list_head list; /* per-tunnel list node */ refcount_t ref_count; + struct hlist_node hlist; /* per-net session hlist */ + unsigned long hlist_key; /* key for session hlist */ + struct l2tp_session_coll_list *coll_list; /* session collision list */ + struct list_head clist; /* for coll_list */ char name[L2TP_SESSION_NAME_MAX]; /* for logging */ char ifname[IFNAMSIZ]; @@ -102,7 +103,6 @@ struct l2tp_session { int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; - struct hlist_node global_hlist; /* global hash list node */ /* Session receive handler for data packets. * Each pseudowire implementation should implement this callback in order to @@ -114,7 +114,7 @@ struct l2tp_session { /* Session close handler. * Each pseudowire implementation may implement this callback in order to carry * out pseudowire-specific shutdown actions. - * The callback is called by core after unhashing the session and purging its + * The callback is called by core after unlisting the session and purging its * reorder queue. */ void (*session_close)(struct l2tp_session *session); @@ -150,7 +150,7 @@ struct l2tp_tunnel_cfg { /* Represents a tunnel instance. * Tracks runtime state including IO statistics. * Holds the tunnel socket (either passed from userspace or directly created by the kernel). - * Maintains a hashlist of sessions belonging to the tunnel instance. + * Maintains a list of sessions belonging to the tunnel instance. * Is linked into a per-net list of tunnels. */ #define L2TP_TUNNEL_NAME_MAX 20 @@ -160,12 +160,11 @@ struct l2tp_tunnel { unsigned long dead; struct rcu_head rcu; - spinlock_t hlist_lock; /* write-protection for session_hlist */ + spinlock_t list_lock; /* write-protection for session_list */ bool acpt_newsess; /* indicates whether this tunnel accepts - * new sessions. Protected by hlist_lock. + * new sessions. Protected by list_lock. */ - struct hlist_head session_hlist[L2TP_HASH_SIZE]; - /* hashed list of sessions, hashed by id */ + struct list_head session_list; /* list of sessions */ u32 tunnel_id; u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ @@ -174,7 +173,6 @@ struct l2tp_tunnel { enum l2tp_encap_type encap; struct l2tp_stats stats; - struct list_head list; /* list node on per-namespace list of tunnels */ struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; @@ -224,10 +222,11 @@ void l2tp_session_dec_refcount(struct l2tp_session *session); */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); -struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel, - u32 session_id); -struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id); +struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id); +struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id); +struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver, + u32 tunnel_id, u32 session_id); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 4595b56d175d..8755ae521154 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -123,17 +123,14 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) struct l2tp_tunnel *tunnel = v; struct l2tp_session *session; int session_count = 0; - int hash; rcu_read_lock_bh(); - for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { - hlist_for_each_entry_rcu(session, &tunnel->session_hlist[hash], hlist) { - /* Session ID of zero is a dummy/reserved value used by pppol2tp */ - if (session->session_id == 0) - continue; + list_for_each_entry_rcu(session, &tunnel->session_list, list) { + /* Session ID of zero is a dummy/reserved value used by pppol2tp */ + if (session->session_id == 0) + continue; - session_count++; - } + session_count++; } rcu_read_unlock_bh(); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 19c8cc5289d5..e48aa177d74c 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -140,7 +140,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_get(net, session_id); + session = l2tp_v3_session_get(net, NULL, session_id); if (!session) goto discard; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 8780ec64f376..d217ff1f229e 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -150,7 +150,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_get(net, session_id); + session = l2tp_v3_session_get(net, NULL, session_id); if (!session) goto discard; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index a901fd14fe3b..d105030520f9 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -61,7 +61,8 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info) session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (tunnel) { - session = l2tp_tunnel_get_session(tunnel, session_id); + session = l2tp_session_get(net, tunnel->sock, tunnel->version, + tunnel_id, session_id); l2tp_tunnel_dec_refcount(tunnel); } } @@ -635,7 +636,8 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf &cfg); if (ret >= 0) { - session = l2tp_tunnel_get_session(tunnel, session_id); + session = l2tp_session_get(net, tunnel->sock, tunnel->version, + tunnel_id, session_id); if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 6146e4e67bbb..3596290047b2 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -753,7 +753,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = info.peer_tunnel_id; - session = l2tp_tunnel_get_session(tunnel, info.session_id); + session = l2tp_session_get(sock_net(sk), tunnel->sock, tunnel->version, + info.tunnel_id, info.session_id); if (session) { drop_refcnt = true; @@ -1045,7 +1046,8 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, /* If session_id is set, search the corresponding session in the * context of this tunnel and record the session's statistics. */ - session = l2tp_tunnel_get_session(tunnel, stats->session_id); + session = l2tp_session_get(tunnel->l2tp_net, tunnel->sock, tunnel->version, + tunnel->tunnel_id, stats->session_id); if (!session) return -EBADR; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 83ad6c9709fe..afb361a043d9 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1379,6 +1379,11 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, (IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ); + link_conf->eht_80mhz_full_bw_ul_mumimo = + params->eht_cap->fixed.phy_cap_info[7] & + (IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | + IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | + IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ); } else { link_conf->eht_su_beamformer = false; link_conf->eht_su_beamformee = false; @@ -1666,7 +1671,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, if (sdata->wdev.cac_started) { chandef = link_conf->chanreq.oper; - wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work); + wiphy_delayed_work_cancel(wiphy, &sdata->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL); @@ -3467,7 +3472,7 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, if (err) goto out_unlock; - wiphy_delayed_work_queue(wiphy, &sdata->deflink.dfs_cac_timer_work, + wiphy_delayed_work_queue(wiphy, &sdata->dfs_cac_timer_work, msecs_to_jiffies(cac_time_ms)); out_unlock: @@ -3483,12 +3488,8 @@ static void ieee80211_end_cac(struct wiphy *wiphy, lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { - /* it might be waiting for the local->mtx, but then - * by the time it gets it, sdata->wdev.cac_started - * will no longer be true - */ wiphy_delayed_work_cancel(wiphy, - &sdata->deflink.dfs_cac_timer_work); + &sdata->dfs_cac_timer_work); if (sdata->wdev.cac_started) { ieee80211_link_release_channel(&sdata->deflink); @@ -3638,10 +3639,10 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) continue; wiphy_work_queue(iter->local->hw.wiphy, - &iter->deflink.csa_finalize_work); + &iter->deflink.csa.finalize_work); } } - wiphy_work_queue(local->hw.wiphy, &link_data->csa_finalize_work); + wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work); rcu_read_unlock(); } @@ -3728,7 +3729,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) } if (!cfg80211_chandef_identical(&link_conf->chanreq.oper, - &link_data->csa_chanreq.oper)) + &link_data->csa.chanreq.oper)) return -EINVAL; link_conf->csa_active = false; @@ -3749,7 +3750,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) if (err) return err; - cfg80211_ch_switch_notify(sdata->dev, &link_data->csa_chanreq.oper, + cfg80211_ch_switch_notify(sdata->dev, &link_data->csa.chanreq.oper, link_data->link_id); return 0; @@ -3770,7 +3771,7 @@ static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data) void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = - container_of(work, struct ieee80211_link_data, csa_finalize_work); + container_of(work, struct ieee80211_link_data, csa.finalize_work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; @@ -4017,7 +4018,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } - link_data->csa_chanreq = chanreq; + link_data->csa.chanreq = chanreq; link_conf->csa_active = true; if (params->block_tx && @@ -4028,12 +4029,12 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, } cfg80211_ch_switch_started_notify(sdata->dev, - &link_data->csa_chanreq.oper, link_id, + &link_data->csa.chanreq.oper, link_id, params->count, params->block_tx); if (changed) { ieee80211_link_info_change_notify(sdata, link_data, changed); - drv_channel_switch_beacon(sdata, &link_data->csa_chanreq.oper); + drv_channel_switch_beacon(sdata, &link_data->csa.chanreq.oper); } else { /* if the beacon didn't change, we can finalize immediately */ ieee80211_csa_finalize(link_data); @@ -4979,13 +4980,17 @@ static void ieee80211_del_intf_link(struct wiphy *wiphy, ieee80211_vif_set_links(sdata, wdev->valid_links, 0); } -static int sta_add_link_station(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct link_station_parameters *params) +static int +ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev, + struct link_station_parameters *params) { + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; int ret; + lockdep_assert_wiphy(local->hw.wiphy); + sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; @@ -5011,23 +5016,15 @@ static int sta_add_link_station(struct ieee80211_local *local, } static int -ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev, +ieee80211_mod_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); - - lockdep_assert_wiphy(sdata->local->hw.wiphy); - - return sta_add_link_station(local, sdata, params); -} - -static int sta_mod_link_station(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct link_station_parameters *params) -{ struct sta_info *sta; + lockdep_assert_wiphy(local->hw.wiphy); + sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; @@ -5039,22 +5036,14 @@ static int sta_mod_link_station(struct ieee80211_local *local, } static int -ieee80211_mod_link_station(struct wiphy *wiphy, struct net_device *dev, - struct link_station_parameters *params) +ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev, + struct link_station_del_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct ieee80211_local *local = wiphy_priv(wiphy); + struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); - return sta_mod_link_station(local, sdata, params); -} - -static int sta_del_link_station(struct ieee80211_sub_if_data *sdata, - struct link_station_del_parameters *params) -{ - struct sta_info *sta; - sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; @@ -5071,17 +5060,6 @@ static int sta_del_link_station(struct ieee80211_sub_if_data *sdata, return 0; } -static int -ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev, - struct link_station_del_parameters *params) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - lockdep_assert_wiphy(sdata->local->hw.wiphy); - - return sta_del_link_station(sdata, params); -} - static int ieee80211_set_hw_timestamp(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 380695fdc32f..ec16d7676088 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1162,11 +1162,11 @@ ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: wiphy_work_queue(sdata->local->hw.wiphy, - &link->csa_finalize_work); + &link->csa.finalize_work); break; case NL80211_IFTYPE_STATION: wiphy_delayed_work_queue(sdata->local->hw.wiphy, - &link->u.mgd.chswitch_work, 0); + &link->u.mgd.csa.switch_work, 0); break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_AP_VLAN: diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 5d078c0a2323..d4e73d3630e0 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1150,6 +1150,9 @@ drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata, if (!check_sdata_in_driver(sdata)) return -EIO; + if (!ieee80211_vif_link_active(&sdata->vif, ch_switch->link_id)) + return 0; + trace_drv_pre_channel_switch(local, sdata, ch_switch); if (local->ops->pre_channel_switch) ret = local->ops->pre_channel_switch(&local->hw, &sdata->vif, @@ -1171,6 +1174,9 @@ drv_post_channel_switch(struct ieee80211_link_data *link) if (!check_sdata_in_driver(sdata)) return -EIO; + if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) + return 0; + trace_drv_post_channel_switch(local, sdata); if (local->ops->post_channel_switch) ret = local->ops->post_channel_switch(&local->hw, &sdata->vif, @@ -1191,6 +1197,9 @@ drv_abort_channel_switch(struct ieee80211_link_data *link) if (!check_sdata_in_driver(sdata)) return; + if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) + return; + trace_drv_abort_channel_switch(local, sdata); if (local->ops->abort_channel_switch) @@ -1210,6 +1219,9 @@ drv_channel_switch_rx_beacon(struct ieee80211_sub_if_data *sdata, if (!check_sdata_in_driver(sdata)) return; + if (!ieee80211_vif_link_active(&sdata->vif, ch_switch->link_id)) + return; + trace_drv_channel_switch_rx_beacon(local, sdata, ch_switch); if (local->ops->channel_switch_rx_beacon) local->ops->channel_switch_rx_beacon(&local->hw, &sdata->vif, diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index d7e8cf8e48b7..79caeb485fd5 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -475,7 +475,7 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, sdata->vif.type == NL80211_IFTYPE_MESH_POINT) memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 7ace5cdc6c26..bf338f3d4dd3 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2023 Intel Corporation + * Copyright(c) 2018-2024 Intel Corporation */ #include <linux/delay.h> @@ -533,12 +533,12 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed) IEEE80211_PRIVACY(ifibss->privacy)); /* XXX: should not really modify cfg80211 data */ if (cbss) { - cbss->channel = sdata->deflink.csa_chanreq.oper.chan; + cbss->channel = sdata->deflink.csa.chanreq.oper.chan; cfg80211_put_bss(sdata->local->hw.wiphy, cbss); } } - ifibss->chandef = sdata->deflink.csa_chanreq.oper; + ifibss->chandef = sdata->deflink.csa.chanreq.oper; /* generate the beacon */ return ieee80211_ibss_csa_beacon(sdata, NULL, changed); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3cedfdc9099b..3e735c9436d3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -26,6 +26,7 @@ #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/rbtree.h> +#include <kunit/visibility.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> @@ -974,10 +975,15 @@ struct ieee80211_link_data_managed { bool disable_wmm_tracking; bool operating_11g_mode; - bool csa_waiting_bcn; - bool csa_ignored_same_chan; - bool csa_blocked_tx; - struct wiphy_delayed_work chswitch_work; + struct { + struct wiphy_delayed_work switch_work; + struct cfg80211_chan_def ap_chandef; + struct ieee80211_parsed_tpe tpe; + unsigned long time; + bool waiting_bcn; + bool ignored_same_chan; + bool blocked_tx; + } csa; struct wiphy_work request_smps_work; /* used to reconfigure hardware SM PS */ @@ -1036,11 +1042,13 @@ struct ieee80211_link_data { struct ieee80211_key __rcu *default_mgmt_key; struct ieee80211_key __rcu *default_beacon_key; - struct wiphy_work csa_finalize_work; bool operating_11g_mode; - struct ieee80211_chan_req csa_chanreq; + struct { + struct wiphy_work finalize_work; + struct ieee80211_chan_req chanreq; + } csa; struct wiphy_work color_change_finalize_work; struct delayed_work color_collision_detect_work; @@ -1059,7 +1067,6 @@ struct ieee80211_link_data { int ap_power_level; /* in dBm */ bool radar_required; - struct wiphy_delayed_work dfs_cac_timer_work; union { struct ieee80211_link_data_managed mgd; @@ -1158,6 +1165,8 @@ struct ieee80211_sub_if_data { struct ieee80211_link_data deflink; struct ieee80211_link_data __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; + struct wiphy_delayed_work dfs_cac_timer_work; + /* for ieee80211_set_active_links_async() */ struct wiphy_work activate_links_work; u16 desired_active_links; @@ -1708,7 +1717,6 @@ struct ieee802_11_elems { const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; - const struct ieee80211_tx_pwr_env *tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT]; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; @@ -1746,6 +1754,10 @@ struct ieee802_11_elems { const struct ieee80211_bandwidth_indication *bandwidth_indication; const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT]; + /* not the order in the psd values is per element, not per chandef */ + struct ieee80211_parsed_tpe tpe; + struct ieee80211_parsed_tpe csa_tpe; + /* length of them, respectively */ u8 ext_capab_len; u8 ssid_len; @@ -1764,8 +1776,6 @@ struct ieee802_11_elems { u8 perr_len; u8 country_elem_len; u8 bssid_index_len; - u8 tx_pwr_env_len[IEEE80211_TPE_MAX_IE_COUNT]; - u8 tx_pwr_env_num; u8 eht_cap_len; /* mult-link element can be de-fragmented and thus u8 is not sufficient */ @@ -1970,6 +1980,7 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local); void ieee80211_offchannel_return(struct ieee80211_local *local); void ieee80211_roc_setup(struct ieee80211_local *local); void ieee80211_start_next_roc(struct ieee80211_local *local); +void ieee80211_reconfig_roc(struct ieee80211_local *local); void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, @@ -2245,6 +2256,7 @@ int ieee80211_frame_duration(enum nl80211_band band, size_t len, void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac); +void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe); void ieee80211_set_wmm_default(struct ieee80211_link_data *link, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, @@ -2683,6 +2695,11 @@ void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_d #define VISIBLE_IF_MAC80211_KUNIT ieee80211_rx_result ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx); +int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, + u8 n_partial_subchans); +void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, + const struct cfg80211_chan_def *ap, + const struct cfg80211_chan_def *used); #else #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT static diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b935bb5d8ed1..e31ce8d7649d 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -543,18 +543,18 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do sdata->vif.bss_conf.csa_active = false; if (sdata->vif.type == NL80211_IFTYPE_STATION) - sdata->deflink.u.mgd.csa_waiting_bcn = false; + sdata->deflink.u.mgd.csa.waiting_bcn = false; if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); sdata->csa_blocked_queues = false; } - wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa_finalize_work); + wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa.finalize_work); wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.color_change_finalize_work); wiphy_delayed_work_cancel(local->hw.wiphy, - &sdata->deflink.dfs_cac_timer_work); + &sdata->dfs_cac_timer_work); if (sdata->wdev.cac_started) { chandef = sdata->vif.bss_conf.chanreq.oper; @@ -1744,6 +1744,8 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, wiphy_work_init(&sdata->work, ieee80211_iface_work); wiphy_work_init(&sdata->activate_links_work, ieee80211_activate_links_work); + wiphy_delayed_work_init(&sdata->dfs_cac_timer_work, + ieee80211_dfs_cac_timer_work); switch (type) { case NL80211_IFTYPE_P2P_GO: diff --git a/net/mac80211/link.c b/net/mac80211/link.c index af0321408a97..2e6e92defbca 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -37,7 +37,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, link_conf->link_id = link_id; link_conf->vif = &sdata->vif; - wiphy_work_init(&link->csa_finalize_work, + wiphy_work_init(&link->csa.finalize_work, ieee80211_csa_finalize_work); wiphy_work_init(&link->color_change_finalize_work, ieee80211_color_change_finalize_work); @@ -45,8 +45,6 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_color_collision_detection_work); INIT_LIST_HEAD(&link->assigned_chanctx_list); INIT_LIST_HEAD(&link->reserved_chanctx_list); - wiphy_delayed_work_init(&link->dfs_cac_timer_work, - ieee80211_dfs_cac_timer_work); if (!deflink) { switch (sdata->vif.type) { @@ -74,7 +72,7 @@ void ieee80211_link_stop(struct ieee80211_link_data *link) cancel_delayed_work_sync(&link->color_collision_detect_work); wiphy_work_cancel(link->sdata->local->hw.wiphy, - &link->csa_finalize_work); + &link->csa.finalize_work); ieee80211_link_release_channel(link); } @@ -359,6 +357,18 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, ieee80211_teardown_tdls_peers(link); __ieee80211_link_release_channel(link, true); + + /* + * If CSA is (still) active while the link is deactivated, + * just schedule the channel switch work for the time we + * had previously calculated, and we'll take the process + * from there. + */ + if (link->conf->csa_active) + wiphy_delayed_work_queue(local->hw.wiphy, + &link->u.mgd.csa.switch_work, + link->u.mgd.csa.time - + jiffies); } list_for_each_entry(sta, &local->sta_list, list) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0965ad11ec74..a9aefc83d30a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -337,6 +337,8 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, might_sleep(); + WARN_ON_ONCE(ieee80211_vif_is_mld(&sdata->vif)); + if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) return; @@ -369,7 +371,6 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, if (changed & ~BSS_CHANGED_VIF_CFG_FLAGS) { u64 ch = changed & ~BSS_CHANGED_VIF_CFG_FLAGS; - /* FIXME: should be for each link */ trace_drv_link_info_changed(local, sdata, &sdata->vif.bss_conf, changed); if (local->ops->link_info_changed) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a5f2d3cfe60d..ef3280fafbe9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -606,11 +606,218 @@ static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, return true; } +static int ieee80211_chandef_num_subchans(const struct cfg80211_chan_def *c) +{ + if (c->width == NL80211_CHAN_WIDTH_80P80) + return 4 + 4; + + return nl80211_chan_width_to_mhz(c->width) / 20; +} + +static int ieee80211_chandef_num_widths(const struct cfg80211_chan_def *c) +{ + switch (c->width) { + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_20_NOHT: + return 1; + case NL80211_CHAN_WIDTH_40: + return 2; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_80: + return 3; + case NL80211_CHAN_WIDTH_160: + return 4; + case NL80211_CHAN_WIDTH_320: + return 5; + default: + WARN_ON(1); + return 0; + } +} + +VISIBLE_IF_MAC80211_KUNIT int +ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, + u8 n_partial_subchans) +{ + int n = ieee80211_chandef_num_subchans(ap); + struct cfg80211_chan_def tmp = *ap; + int offset = 0; + + /* + * Given a chandef (in this context, it's the AP's) and a number + * of subchannels that we want to look at ('n_partial_subchans'), + * calculate the offset in number of subchannels between the full + * and the subset with the desired width. + */ + + /* same number of subchannels means no offset, obviously */ + if (n == n_partial_subchans) + return 0; + + /* don't WARN - misconfigured APs could cause this if their N > width */ + if (n < n_partial_subchans) + return 0; + + while (ieee80211_chandef_num_subchans(&tmp) > n_partial_subchans) { + u32 prev = tmp.center_freq1; + + ieee80211_chandef_downgrade(&tmp, NULL); + + /* + * if center_freq moved up, half the original channels + * are gone now but were below, so increase offset + */ + if (prev < tmp.center_freq1) + offset += ieee80211_chandef_num_subchans(&tmp); + } + + /* + * 80+80 with secondary 80 below primary - four subchannels for it + * (we cannot downgrade *to* 80+80, so no need to consider 'tmp') + */ + if (ap->width == NL80211_CHAN_WIDTH_80P80 && + ap->center_freq2 < ap->center_freq1) + offset += 4; + + return offset; +} +EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_calc_chandef_subchan_offset); + +VISIBLE_IF_MAC80211_KUNIT void +ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, + const struct cfg80211_chan_def *ap, + const struct cfg80211_chan_def *used) +{ + u8 needed = ieee80211_chandef_num_subchans(used); + u8 have = ieee80211_chandef_num_subchans(ap); + u8 tmp[IEEE80211_TPE_PSD_ENTRIES_320MHZ]; + u8 offset; + + if (!psd->valid) + return; + + /* if N is zero, all defaults were used, no point in rearranging */ + if (!psd->n) + goto out; + + BUILD_BUG_ON(sizeof(tmp) != sizeof(psd->power)); + + /* + * This assumes that 'N' is consistent with the HE channel, as + * it should be (otherwise the AP is broken). + * + * In psd->power we have values in the order 0..N, 0..K, where + * N+K should cover the entire channel per 'ap', but even if it + * doesn't then we've pre-filled 'unlimited' as defaults. + * + * But this is all the wrong order, we want to have them in the + * order of the 'used' channel. + * + * So for example, we could have a 320 MHz EHT AP, which has the + * HE channel as 80 MHz (e.g. due to puncturing, which doesn't + * seem to be considered for the TPE), as follows: + * + * EHT 320: | | | | | | | | | | | | | | | | | + * HE 80: | | | | | + * used 160: | | | | | | | | | + * + * N entries: |--|--|--|--| + * K entries: |--|--|--|--|--|--|--|--| |--|--|--|--| + * power idx: 4 5 6 7 8 9 10 11 0 1 2 3 12 13 14 15 + * full chan: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * used chan: 0 1 2 3 4 5 6 7 + * + * The idx in the power array ('power idx') is like this since it + * comes directly from the element's N and K entries in their + * element order, and those are this way for HE compatibility. + * + * Rearrange them as desired here, first by putting them into the + * 'full chan' order, and then selecting the necessary subset for + * the 'used chan'. + */ + + /* first reorder according to AP channel */ + offset = ieee80211_calc_chandef_subchan_offset(ap, psd->n); + for (int i = 0; i < have; i++) { + if (i < offset) + tmp[i] = psd->power[i + psd->n]; + else if (i < offset + psd->n) + tmp[i] = psd->power[i - offset]; + else + tmp[i] = psd->power[i]; + } + + /* + * and then select the subset for the used channel + * (set everything to defaults first in case a driver is confused) + */ + memset(psd->power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(psd->power)); + offset = ieee80211_calc_chandef_subchan_offset(ap, needed); + for (int i = 0; i < needed; i++) + psd->power[i] = tmp[offset + i]; + +out: + /* limit, but don't lie if there are defaults in the data */ + if (needed < psd->count) + psd->count = needed; +} +EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_rearrange_tpe_psd); + +static void ieee80211_rearrange_tpe(struct ieee80211_parsed_tpe *tpe, + const struct cfg80211_chan_def *ap, + const struct cfg80211_chan_def *used) +{ + /* ignore this completely for narrow/invalid channels */ + if (!ieee80211_chandef_num_subchans(ap) || + !ieee80211_chandef_num_subchans(used)) { + ieee80211_clear_tpe(tpe); + return; + } + + for (int i = 0; i < 2; i++) { + int needed_pwr_count; + + ieee80211_rearrange_tpe_psd(&tpe->psd_local[i], ap, used); + ieee80211_rearrange_tpe_psd(&tpe->psd_reg_client[i], ap, used); + + /* limit this to the widths we actually need */ + needed_pwr_count = ieee80211_chandef_num_widths(used); + if (needed_pwr_count < tpe->max_local[i].count) + tpe->max_local[i].count = needed_pwr_count; + if (needed_pwr_count < tpe->max_reg_client[i].count) + tpe->max_reg_client[i].count = needed_pwr_count; + } +} + +/* + * The AP part of the channel request is used to distinguish settings + * to the device used for wider bandwidth OFDMA. This is used in the + * channel context code to assign two channel contexts even if they're + * both for the same channel, if the AP bandwidths are incompatible. + * If not EHT (or driver override) then ap.chan == NULL indicates that + * there's no wider BW OFDMA used. + */ +static void ieee80211_set_chanreq_ap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_chan_req *chanreq, + struct ieee80211_conn_settings *conn, + struct cfg80211_chan_def *ap_chandef) +{ + chanreq->ap.chan = NULL; + + if (conn->mode < IEEE80211_CONN_MODE_EHT) + return; + if (sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW) + return; + + chanreq->ap = *ap_chandef; +} + static struct ieee802_11_elems * ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_bss *cbss, int link_id, - struct ieee80211_chan_req *chanreq) + struct ieee80211_chan_req *chanreq, + struct cfg80211_chan_def *ap_chandef) { const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies); struct ieee80211_bss *bss = (void *)cbss->priv; @@ -623,7 +830,6 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, }; struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; - struct cfg80211_chan_def ap_chandef; enum ieee80211_conn_mode ap_mode; int ret; @@ -634,7 +840,7 @@ again: return ERR_PTR(-ENOMEM); ap_mode = ieee80211_determine_ap_chan(sdata, channel, bss->vht_cap_info, - elems, false, conn, &ap_chandef); + elems, false, conn, ap_chandef); /* this should be impossible since parsing depends on our mode */ if (WARN_ON(ap_mode > conn->mode)) { @@ -701,14 +907,9 @@ again: break; } - chanreq->oper = ap_chandef; + chanreq->oper = *ap_chandef; - /* wider-bandwidth OFDMA is only done in EHT */ - if (conn->mode >= IEEE80211_CONN_MODE_EHT && - !(sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW)) - chanreq->ap = ap_chandef; - else - chanreq->ap.chan = NULL; + ieee80211_set_chanreq_ap(sdata, chanreq, conn, ap_chandef); while (!ieee80211_chandef_usable(sdata, &chanreq->oper, IEEE80211_CHAN_DISABLED)) { @@ -738,7 +939,7 @@ again: IEEE80211_CONN_BW_LIMIT_160); } - if (chanreq->oper.width != ap_chandef.width || ap_mode != conn->mode) + if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode) sdata_info(sdata, "regulatory prevented using AP config, downgraded\n"); @@ -790,6 +991,7 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, struct ieee80211_channel *channel = link->conf->chanreq.oper.chan; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chan_req chanreq = {}; + struct cfg80211_chan_def ap_chandef; enum ieee80211_conn_mode ap_mode; u32 vht_cap_info = 0; u16 ht_opmode; @@ -805,7 +1007,7 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, ap_mode = ieee80211_determine_ap_chan(sdata, channel, vht_cap_info, elems, true, &link->u.mgd.conn, - &chanreq.ap); + &ap_chandef); if (ap_mode != link->u.mgd.conn.mode) { link_info(link, @@ -815,10 +1017,9 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, return -EINVAL; } - chanreq.oper = chanreq.ap; - if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT || - sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW) - chanreq.ap.chan = NULL; + chanreq.oper = ap_chandef; + ieee80211_set_chanreq_ap(sdata, &chanreq, &link->u.mgd.conn, + &ap_chandef); /* * if HT operation mode changed store the new one - @@ -843,6 +1044,16 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, ieee80211_min_bw_limit_from_chandef(&chanreq.oper)) ieee80211_chandef_downgrade(&chanreq.oper, NULL); + if (ap_chandef.chan->band == NL80211_BAND_6GHZ && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { + ieee80211_rearrange_tpe(&elems->tpe, &ap_chandef, + &chanreq.oper); + if (memcmp(&link->conf->tpe, &elems->tpe, sizeof(elems->tpe))) { + link->conf->tpe = elems->tpe; + *changed |= BSS_CHANGED_TPE; + } + } + if (ieee80211_chanreq_identical(&chanreq, &link->conf->chanreq)) return 0; @@ -1862,12 +2073,12 @@ void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, } /* spectrum management related things */ -static void ieee80211_chswitch_work(struct wiphy *wiphy, - struct wiphy_work *work) +static void ieee80211_csa_switch_work(struct wiphy *wiphy, + struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, - u.mgd.chswitch_work.work); + u.mgd.csa.switch_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -1885,6 +2096,18 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy, return; /* + * If the link isn't active (now), we cannot wait for beacons, won't + * have a reserved chanctx, etc. Just switch over the chandef and + * update cfg80211 directly. + */ + if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) { + link->conf->chanreq = link->csa.chanreq; + cfg80211_ch_switch_notify(sdata->dev, &link->csa.chanreq.oper, + link->link_id); + return; + } + + /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. once reservation is complete it will re-schedule the * work with no reserved_chanctx so verify chandef to check if it @@ -1902,9 +2125,9 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy, ret = ieee80211_link_use_reserved_context(link); if (ret) { - sdata_info(sdata, - "failed to use reserved channel context, disconnecting (err=%d)\n", - ret); + link_info(link, + "failed to use reserved channel context, disconnecting (err=%d)\n", + ret); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); } @@ -1912,15 +2135,29 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy, } if (!ieee80211_chanreq_identical(&link->conf->chanreq, - &link->csa_chanreq)) { - sdata_info(sdata, - "failed to finalize channel switch, disconnecting\n"); + &link->csa.chanreq)) { + link_info(link, + "failed to finalize channel switch, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } - link->u.mgd.csa_waiting_bcn = true; + link->u.mgd.csa.waiting_bcn = true; + + /* apply new TPE restrictions immediately on the new channel */ + if (link->u.mgd.csa.ap_chandef.chan->band == NL80211_BAND_6GHZ && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { + ieee80211_rearrange_tpe(&link->u.mgd.csa.tpe, + &link->u.mgd.csa.ap_chandef, + &link->conf->chanreq.oper); + if (memcmp(&link->conf->tpe, &link->u.mgd.csa.tpe, + sizeof(link->u.mgd.csa.tpe))) { + link->conf->tpe = link->u.mgd.csa.tpe; + ieee80211_link_info_change_notify(sdata, link, + BSS_CHANGED_TPE); + } + } ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); @@ -1944,19 +2181,19 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) } link->conf->csa_active = false; - link->u.mgd.csa_blocked_tx = false; - link->u.mgd.csa_waiting_bcn = false; + link->u.mgd.csa.blocked_tx = false; + link->u.mgd.csa.waiting_bcn = false; ret = drv_post_channel_switch(link); if (ret) { - sdata_info(sdata, - "driver post channel switch failed, disconnecting\n"); + link_info(link, + "driver post channel switch failed, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } - cfg80211_ch_switch_notify(sdata->dev, &link->reserved.oper, + cfg80211_ch_switch_notify(sdata->dev, &link->conf->chanreq.oper, link->link_id); } @@ -1971,7 +2208,8 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, if (!success) { sdata_info(sdata, - "driver channel switch failed, disconnecting\n"); + "driver channel switch failed (link %d), disconnecting\n", + link_id); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.csa_connection_drop_work); } else { @@ -1984,7 +2222,7 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, } wiphy_delayed_work_queue(sdata->local->hw.wiphy, - &link->u.mgd.chswitch_work, 0); + &link->u.mgd.csa.switch_work, 0); } rcu_read_unlock(); @@ -2011,74 +2249,228 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) } link->conf->csa_active = false; - link->u.mgd.csa_blocked_tx = false; + link->u.mgd.csa.blocked_tx = false; drv_abort_channel_switch(link); } +struct sta_csa_rnr_iter_data { + struct ieee80211_link_data *link; + struct ieee80211_channel *chan; + u8 mld_id; +}; + +static enum cfg80211_rnr_iter_ret +ieee80211_sta_csa_rnr_iter(void *_data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len) +{ + struct sta_csa_rnr_iter_data *data = _data; + struct ieee80211_link_data *link = data->link; + struct ieee80211_sub_if_data *sdata = link->sdata; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + const struct ieee80211_tbtt_info_ge_11 *ti; + enum nl80211_band band; + unsigned int center_freq; + int link_id; + + if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) + return RNR_ITER_CONTINUE; + + if (tbtt_info_len < sizeof(*ti)) + return RNR_ITER_CONTINUE; + + ti = (const void *)tbtt_info; + + if (ti->mld_params.mld_id != data->mld_id) + return RNR_ITER_CONTINUE; + + link_id = le16_get_bits(ti->mld_params.params, + IEEE80211_RNR_MLD_PARAMS_LINK_ID); + if (link_id != data->link->link_id) + return RNR_ITER_CONTINUE; + + /* we found the entry for our link! */ + + /* this AP is confused, it had this right before ... just disconnect */ + if (!ieee80211_operating_class_to_band(info->op_class, &band)) { + link_info(link, + "AP now has invalid operating class in RNR, disconnect\n"); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); + return RNR_ITER_BREAK; + } + + center_freq = ieee80211_channel_to_frequency(info->channel, band); + data->chan = ieee80211_get_channel(sdata->local->hw.wiphy, center_freq); + + return RNR_ITER_BREAK; +} + +static void +ieee80211_sta_other_link_csa_disappeared(struct ieee80211_link_data *link, + struct ieee802_11_elems *elems) +{ + struct ieee80211_sub_if_data *sdata = link->sdata; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct sta_csa_rnr_iter_data data = { + .link = link, + }; + + /* + * If we get here, we see a beacon from another link without + * CSA still being reported for it, so now we have to check + * if the CSA was aborted or completed. This may not even be + * perfectly possible if the CSA was only done for changing + * the puncturing, but in that case if the link in inactive + * we don't really care, and if it's an active link (or when + * it's activated later) we'll get a beacon and adjust. + */ + + if (WARN_ON(!elems->ml_basic)) + return; + + data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); + + /* + * So in order to do this, iterate the RNR element(s) and see + * what channel is reported now. + */ + cfg80211_iter_rnr(elems->ie_start, elems->total_len, + ieee80211_sta_csa_rnr_iter, &data); + + if (!data.chan) { + link_info(link, + "couldn't find (valid) channel in RNR for CSA, disconnect\n"); + wiphy_work_queue(sdata->local->hw.wiphy, + &ifmgd->csa_connection_drop_work); + return; + } + + /* + * If it doesn't match the CSA, then assume it aborted. This + * may erroneously detect that it was _not_ aborted when it + * was in fact aborted, but only changed the bandwidth or the + * puncturing configuration, but we don't have enough data to + * detect that. + */ + if (data.chan != link->csa.chanreq.oper.chan) + ieee80211_sta_abort_chanswitch(link); +} + +enum ieee80211_csa_source { + IEEE80211_CSA_SOURCE_BEACON, + IEEE80211_CSA_SOURCE_OTHER_LINK, + IEEE80211_CSA_SOURCE_ACTION, +}; + static void ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, u64 timestamp, u32 device_timestamp, - struct ieee802_11_elems *elems, - bool beacon) + struct ieee802_11_elems *full_elems, + struct ieee802_11_elems *csa_elems, + enum ieee80211_csa_source source) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct cfg80211_bss *cbss = link->conf->bss; + struct ieee80211_chanctx *chanctx = NULL; struct ieee80211_chanctx_conf *conf; - struct ieee80211_chanctx *chanctx; - enum nl80211_band current_band; - struct ieee80211_csa_ie csa_ie; + struct ieee80211_csa_ie csa_ie = {}; struct ieee80211_channel_switch ch_switch = { .link_id = link->link_id, + .timestamp = timestamp, + .device_timestamp = device_timestamp, }; - struct ieee80211_bss *bss; - unsigned long timeout; + unsigned long now; int res; lockdep_assert_wiphy(local->hw.wiphy); - if (!cbss) - return; + if (csa_elems) { + struct cfg80211_bss *cbss = link->conf->bss; + enum nl80211_band current_band; + struct ieee80211_bss *bss; - current_band = cbss->channel->band; - bss = (void *)cbss->priv; - res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band, - bss->vht_cap_info, - &link->u.mgd.conn, - link->u.mgd.bssid, &csa_ie); + if (WARN_ON(!cbss)) + return; + + current_band = cbss->channel->band; + bss = (void *)cbss->priv; + + res = ieee80211_parse_ch_switch_ie(sdata, csa_elems, + current_band, + bss->vht_cap_info, + &link->u.mgd.conn, + link->u.mgd.bssid, &csa_ie); + if (res == 0) { + ch_switch.block_tx = csa_ie.mode; + ch_switch.chandef = csa_ie.chanreq.oper; + ch_switch.count = csa_ie.count; + ch_switch.delay = csa_ie.max_switch_time; + } - if (!res) { - ch_switch.timestamp = timestamp; - ch_switch.device_timestamp = device_timestamp; - ch_switch.block_tx = csa_ie.mode; - ch_switch.chandef = csa_ie.chanreq.oper; - ch_switch.count = csa_ie.count; - ch_switch.delay = csa_ie.max_switch_time; + link->u.mgd.csa.tpe = csa_elems->csa_tpe; + } else { + /* + * If there was no per-STA profile for this link, we + * get called with csa_elems == NULL. This of course means + * there are no CSA elements, so set res=1 indicating + * no more CSA. + */ + res = 1; } if (res < 0) goto drop_connection; if (link->conf->csa_active) { - /* already processing - disregard action frames */ - if (!beacon) + switch (source) { + case IEEE80211_CSA_SOURCE_ACTION: + /* already processing - disregard action frames */ return; + case IEEE80211_CSA_SOURCE_BEACON: + if (link->u.mgd.csa.waiting_bcn) { + ieee80211_chswitch_post_beacon(link); + /* + * If the CSA is still present after the switch + * we need to consider it as a new CSA (possibly + * to self). This happens by not returning here + * so we'll get to the check below. + */ + } else if (res) { + ieee80211_sta_abort_chanswitch(link); + return; + } else { + drv_channel_switch_rx_beacon(sdata, &ch_switch); + return; + } + break; + case IEEE80211_CSA_SOURCE_OTHER_LINK: + /* active link: we want to see the beacon to continue */ + if (ieee80211_vif_link_active(&sdata->vif, + link->link_id)) + return; - if (link->u.mgd.csa_waiting_bcn) { - ieee80211_chswitch_post_beacon(link); - /* - * If the CSA IE is still present in the beacon after - * the switch, we need to consider it as a new CSA - * (possibly to self) - this happens by not returning - * here so we'll get to the check below. - */ - } else if (res) { - ieee80211_sta_abort_chanswitch(link); - return; - } else { - drv_channel_switch_rx_beacon(sdata, &ch_switch); + /* switch work ran, so just complete the process */ + if (link->u.mgd.csa.waiting_bcn) { + ieee80211_chswitch_post_beacon(link); + /* + * If the CSA is still present after the switch + * we need to consider it as a new CSA (possibly + * to self). This happens by not returning here + * so we'll get to the check below. + */ + break; + } + + /* link still has CSA but we already know, do nothing */ + if (!res) + return; + + /* check in the RNR if the CSA aborted */ + ieee80211_sta_other_link_csa_disappeared(link, + full_elems); return; } } @@ -2089,41 +2481,39 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, if (link->conf->chanreq.oper.chan->band != csa_ie.chanreq.oper.chan->band) { - sdata_info(sdata, - "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", - link->u.mgd.bssid, - csa_ie.chanreq.oper.chan->center_freq, - csa_ie.chanreq.oper.width, - csa_ie.chanreq.oper.center_freq1, - csa_ie.chanreq.oper.center_freq2); + link_info(link, + "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", + link->u.mgd.bssid, + csa_ie.chanreq.oper.chan->center_freq, + csa_ie.chanreq.oper.width, + csa_ie.chanreq.oper.center_freq1, + csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chanreq.oper, IEEE80211_CHAN_DISABLED)) { - sdata_info(sdata, - "AP %pM switches to unsupported channel " - "(%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), " - "disconnecting\n", - link->u.mgd.bssid, - csa_ie.chanreq.oper.chan->center_freq, - csa_ie.chanreq.oper.chan->freq_offset, - csa_ie.chanreq.oper.width, - csa_ie.chanreq.oper.center_freq1, - csa_ie.chanreq.oper.freq1_offset, - csa_ie.chanreq.oper.center_freq2); + link_info(link, + "AP %pM switches to unsupported channel (%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), disconnecting\n", + link->u.mgd.bssid, + csa_ie.chanreq.oper.chan->center_freq, + csa_ie.chanreq.oper.chan->freq_offset, + csa_ie.chanreq.oper.width, + csa_ie.chanreq.oper.center_freq1, + csa_ie.chanreq.oper.freq1_offset, + csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (cfg80211_chandef_identical(&csa_ie.chanreq.oper, &link->conf->chanreq.oper) && - (!csa_ie.mode || !beacon)) { - if (link->u.mgd.csa_ignored_same_chan) + (!csa_ie.mode || source != IEEE80211_CSA_SOURCE_BEACON)) { + if (link->u.mgd.csa.ignored_same_chan) return; - sdata_info(sdata, - "AP %pM tries to chanswitch to same channel, ignore\n", - link->u.mgd.bssid); - link->u.mgd.csa_ignored_same_chan = true; + link_info(link, + "AP %pM tries to chanswitch to same channel, ignore\n", + link->u.mgd.bssid); + link->u.mgd.csa.ignored_same_chan = true; return; } @@ -2138,40 +2528,48 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); - if (!conf) { - sdata_info(sdata, - "no channel context assigned to vif?, disconnecting\n"); + if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && !conf) { + link_info(link, + "no channel context assigned to vif?, disconnecting\n"); goto drop_connection; } - chanctx = container_of(conf, struct ieee80211_chanctx, conf); + if (conf) + chanctx = container_of(conf, struct ieee80211_chanctx, conf); if (!ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { - sdata_info(sdata, - "driver doesn't support chan-switch with channel contexts\n"); + link_info(link, + "driver doesn't support chan-switch with channel contexts\n"); goto drop_connection; } if (drv_pre_channel_switch(sdata, &ch_switch)) { - sdata_info(sdata, - "preparing for channel switch failed, disconnecting\n"); + link_info(link, + "preparing for channel switch failed, disconnecting\n"); goto drop_connection; } - res = ieee80211_link_reserve_chanctx(link, &csa_ie.chanreq, - chanctx->mode, false); - if (res) { - sdata_info(sdata, - "failed to reserve channel context for channel switch, disconnecting (err=%d)\n", - res); - goto drop_connection; + link->u.mgd.csa.ap_chandef = csa_ie.chanreq.ap; + + link->csa.chanreq.oper = csa_ie.chanreq.oper; + ieee80211_set_chanreq_ap(sdata, &link->csa.chanreq, &link->u.mgd.conn, + &csa_ie.chanreq.ap); + + if (chanctx) { + res = ieee80211_link_reserve_chanctx(link, &link->csa.chanreq, + chanctx->mode, false); + if (res) { + link_info(link, + "failed to reserve channel context for channel switch, disconnecting (err=%d)\n", + res); + goto drop_connection; + } } link->conf->csa_active = true; - link->csa_chanreq = csa_ie.chanreq; - link->u.mgd.csa_ignored_same_chan = false; + link->u.mgd.csa.ignored_same_chan = false; link->u.mgd.beacon_crc_valid = false; - link->u.mgd.csa_blocked_tx = csa_ie.mode; + link->u.mgd.csa.blocked_tx = csa_ie.mode; if (csa_ie.mode && !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA)) { @@ -2184,18 +2582,28 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, link->link_id, csa_ie.count, csa_ie.mode); - if (local->ops->channel_switch) { - /* use driver's channel switch callback */ + /* we may have to handle timeout for deactivated link in software */ + now = jiffies; + link->u.mgd.csa.time = now + + TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * + link->conf->beacon_int); + + if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && + local->ops->channel_switch) { + /* + * Use driver's channel switch callback, the driver will + * later call ieee80211_chswitch_done(). It may deactivate + * the link as well, we handle that elsewhere and queue + * the csa.switch_work for the calculated time then. + */ drv_channel_switch(local, sdata, &ch_switch); return; } /* channel switch handled in software */ - timeout = TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * - cbss->beacon_interval); wiphy_delayed_work_queue(local->hw.wiphy, - &link->u.mgd.chswitch_work, - timeout); + &link->u.mgd.csa.switch_work, + link->u.mgd.csa.time - now); return; drop_connection: /* @@ -2206,7 +2614,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, * reset when the disconnection worker runs. */ link->conf->csa_active = true; - link->u.mgd.csa_blocked_tx = csa_ie.mode; + link->u.mgd.csa.blocked_tx = csa_ie.mode; sdata->csa_blocked_queues = csa_ie.mode && !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA); @@ -2602,16 +3010,15 @@ void ieee80211_dynamic_ps_timer(struct timer_list *t) void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work) { - struct ieee80211_link_data *link = - container_of(work, struct ieee80211_link_data, + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, dfs_cac_timer_work.work); - struct cfg80211_chan_def chandef = link->conf->chanreq.oper; - struct ieee80211_sub_if_data *sdata = link->sdata; + struct cfg80211_chan_def chandef = sdata->vif.bss_conf.chanreq.oper; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (sdata->wdev.cac_started) { - ieee80211_link_release_channel(link); + ieee80211_link_release_channel(&sdata->deflink); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_FINISHED, GFP_KERNEL); @@ -3260,9 +3667,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, } sdata->vif.bss_conf.csa_active = false; - sdata->deflink.u.mgd.csa_blocked_tx = false; - sdata->deflink.u.mgd.csa_waiting_bcn = false; - sdata->deflink.u.mgd.csa_ignored_same_chan = false; + sdata->deflink.u.mgd.csa.blocked_tx = false; + sdata->deflink.u.mgd.csa.waiting_bcn = false; + sdata->deflink.u.mgd.csa.ignored_same_chan = false; if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); @@ -3275,9 +3682,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.power_type = IEEE80211_REG_UNSET_AP; sdata->vif.bss_conf.pwr_reduction = 0; - sdata->vif.bss_conf.tx_pwr_env_num = 0; - memset(sdata->vif.bss_conf.tx_pwr_env, 0, - sizeof(sdata->vif.bss_conf.tx_pwr_env)); + ieee80211_clear_tpe(&sdata->vif.bss_conf.tpe); sdata->vif.cfg.eml_cap = 0; sdata->vif.cfg.eml_med_sync_delay = 0; @@ -3287,8 +3692,17 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sizeof(sdata->u.mgd.ttlm_info)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); + memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->neg_ttlm_timeout_work); + + sdata->u.mgd.removed_links = 0; + wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + &sdata->u.mgd.ml_reconf_work); + + wiphy_work_cancel(sdata->local->hw.wiphy, + &ifmgd->teardown_ttlm_work); + ieee80211_vif_set_links(sdata, 0, 0); ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; @@ -3592,7 +4006,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) if (WARN_ON_ONCE(!link)) continue; - if (link->u.mgd.csa_blocked_tx) + if (link->u.mgd.csa.blocked_tx) continue; tx = true; @@ -3629,8 +4043,8 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) tx, frame_buf); /* the other links will be destroyed */ sdata->vif.bss_conf.csa_active = false; - sdata->deflink.u.mgd.csa_waiting_bcn = false; - sdata->deflink.u.mgd.csa_blocked_tx = false; + sdata->deflink.u.mgd.csa.waiting_bcn = false; + sdata->deflink.u.mgd.csa.blocked_tx = false; if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); @@ -4445,40 +4859,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, if (elems->he_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && elems->he_cap) { - const struct ieee80211_he_6ghz_oper *he_6ghz_oper; - ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->he_6ghz_capa, link_sta); - he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); - - if (is_6ghz && he_6ghz_oper) { - switch (u8_get_bits(he_6ghz_oper->control, - IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { - case IEEE80211_6GHZ_CTRL_REG_LPI_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: - bss_conf->power_type = IEEE80211_REG_LPI_AP; - break; - case IEEE80211_6GHZ_CTRL_REG_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: - bss_conf->power_type = IEEE80211_REG_SP_AP; - break; - case IEEE80211_6GHZ_CTRL_REG_VLP_AP: - bss_conf->power_type = IEEE80211_REG_VLP_AP; - break; - default: - bss_conf->power_type = IEEE80211_REG_UNSET_AP; - break; - } - } else if (is_6ghz) { - link_info(link, - "HE 6 GHz operation missing (on %d MHz), expect issues\n", - bss_conf->chanreq.oper.chan->center_freq); - } - bss_conf->he_support = link_sta->pub->he_cap.has_he; if (elems->rsnx && elems->rsnx_len && (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && @@ -5020,6 +5406,23 @@ ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata, conn->bw_limit, tmp.bw_limit); } +static enum ieee80211_ap_reg_power +ieee80211_ap_power_type(u8 control) +{ + switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: + return IEEE80211_REG_LPI_AP; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + return IEEE80211_REG_SP_AP; + case IEEE80211_6GHZ_CTRL_REG_VLP_AP: + return IEEE80211_REG_VLP_AP; + default: + return IEEE80211_REG_UNSET_AP; + } +} + static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, int link_id, @@ -5029,15 +5432,15 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_chan_req chanreq = {}; + struct cfg80211_chan_def ap_chandef; struct ieee802_11_elems *elems; int ret; - u32 i; lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id, - &chanreq); + &chanreq, &ap_chandef); if (IS_ERR(elems)) { rcu_read_unlock(); @@ -5052,26 +5455,23 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } if (link && is_6ghz && conn->mode >= IEEE80211_CONN_MODE_HE) { - struct ieee80211_bss_conf *bss_conf; - u8 j = 0; - - bss_conf = link->conf; + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; if (elems->pwr_constr_elem) - bss_conf->pwr_reduction = *elems->pwr_constr_elem; + link->conf->pwr_reduction = *elems->pwr_constr_elem; - BUILD_BUG_ON(ARRAY_SIZE(bss_conf->tx_pwr_env) != - ARRAY_SIZE(elems->tx_pwr_env)); - - for (i = 0; i < elems->tx_pwr_env_num; i++) { - if (elems->tx_pwr_env_len[i] > sizeof(bss_conf->tx_pwr_env[j])) - continue; + he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); + if (he_6ghz_oper) + link->conf->power_type = + ieee80211_ap_power_type(he_6ghz_oper->control); + else + link_info(link, + "HE 6 GHz operation missing (on %d MHz), expect issues\n", + cbss->channel->center_freq); - bss_conf->tx_pwr_env_num++; - memcpy(&bss_conf->tx_pwr_env[j], elems->tx_pwr_env[i], - elems->tx_pwr_env_len[i]); - j++; - } + link->conf->tpe = elems->tpe; + ieee80211_rearrange_tpe(&link->conf->tpe, &ap_chandef, + &chanreq.oper); } rcu_read_unlock(); /* the element data was RCU protected so no longer valid anyway */ @@ -6150,6 +6550,110 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, } } +static void +ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata, + int reporting_link_id, + struct ieee802_11_elems *elems) +{ + const struct element *sta_profiles[IEEE80211_MLD_MAX_NUM_LINKS] = {}; + ssize_t sta_profiles_len[IEEE80211_MLD_MAX_NUM_LINKS] = {}; + const struct element *sub; + const u8 *subelems; + size_t subelems_len; + u8 common_size; + int link_id; + + if (!ieee80211_mle_size_ok((u8 *)elems->ml_basic, elems->ml_basic_len)) + return; + + common_size = ieee80211_mle_common_size((u8 *)elems->ml_basic); + subelems = (u8 *)elems->ml_basic + common_size; + subelems_len = elems->ml_basic_len - common_size; + + for_each_element_id(sub, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE, + subelems, subelems_len) { + struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; + struct ieee80211_link_data *link; + ssize_t len; + + if (!ieee80211_mle_basic_sta_prof_size_ok(sub->data, + sub->datalen)) + continue; + + link_id = le16_get_bits(prof->control, + IEEE80211_MLE_STA_CONTROL_LINK_ID); + /* need a valid link ID, but also not our own, both AP bugs */ + if (link_id == reporting_link_id || + link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + continue; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + + len = cfg80211_defragment_element(sub, subelems, subelems_len, + NULL, 0, + IEEE80211_MLE_SUBELEM_FRAGMENT); + if (WARN_ON(len < 0)) + continue; + + sta_profiles[link_id] = sub; + sta_profiles_len[link_id] = len; + } + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct ieee80211_mle_per_sta_profile *prof; + struct ieee802_11_elems *prof_elems; + struct ieee80211_link_data *link; + ssize_t len; + + if (link_id == reporting_link_id) + continue; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + + if (!sta_profiles[link_id]) { + prof_elems = NULL; + goto handle; + } + + /* we can defragment in-place, won't use the buffer again */ + len = cfg80211_defragment_element(sta_profiles[link_id], + subelems, subelems_len, + (void *)sta_profiles[link_id], + sta_profiles_len[link_id], + IEEE80211_MLE_SUBELEM_FRAGMENT); + if (WARN_ON(len != sta_profiles_len[link_id])) + continue; + + prof = (void *)sta_profiles[link_id]; + prof_elems = ieee802_11_parse_elems(prof->variable + + (prof->sta_info_len - 1), + len - + (prof->sta_info_len - 1), + false, NULL); + + /* memory allocation failed - let's hope that's transient */ + if (!prof_elems) + continue; + +handle: + /* + * FIXME: the timings here are obviously incorrect, + * but only older Intel drivers seem to care, and + * those don't have MLO. If you really need this, + * the problem is having to calculate it with the + * TSF offset etc. The device_timestamp is still + * correct, of course. + */ + ieee80211_sta_process_chanswitch(link, 0, 0, elems, prof_elems, + IEEE80211_CSA_SOURCE_OTHER_LINK); + kfree(prof_elems); + } +} + static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) @@ -6374,7 +6878,11 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, - elems, true); + elems, elems, + IEEE80211_CSA_SOURCE_BEACON); + + /* note that after this elems->ml_basic can no longer be used fully */ + ieee80211_mgd_check_cross_link_csa(sdata, rx_status->link_id, elems); if (!link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, @@ -6834,7 +7342,7 @@ static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, u16 new_dormant_links; struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, - u.mgd.neg_ttlm_timeout_work.work); + u.mgd.teardown_ttlm_work); if (!sdata->vif.neg_ttlm.valid) return; @@ -6970,7 +7478,8 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, - elems, false); + elems, elems, + IEEE80211_CSA_SOURCE_ACTION); kfree(elems); } else if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { struct ieee802_11_elems *elems; @@ -6998,7 +7507,8 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, - elems, false); + elems, elems, + IEEE80211_CSA_SOURCE_ACTION); } kfree(elems); @@ -7321,7 +7831,7 @@ static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) return; if (sdata->vif.bss_conf.csa_active && - !sdata->deflink.u.mgd.csa_waiting_bcn) + !sdata->deflink.u.mgd.csa.waiting_bcn) return; if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) @@ -7345,7 +7855,7 @@ static void ieee80211_sta_conn_mon_timer(struct timer_list *t) return; if (sdata->vif.bss_conf.csa_active && - !sdata->deflink.u.mgd.csa_waiting_bcn) + !sdata->deflink.u.mgd.csa.waiting_bcn) return; sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); @@ -7556,8 +8066,10 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) else link->u.mgd.req_smps = IEEE80211_SMPS_OFF; - wiphy_delayed_work_init(&link->u.mgd.chswitch_work, - ieee80211_chswitch_work); + wiphy_delayed_work_init(&link->u.mgd.csa.switch_work, + ieee80211_csa_switch_work); + + ieee80211_clear_tpe(&link->conf->tpe); if (sdata->u.mgd.assoc_data) ether_addr_copy(link->conf->addr, @@ -8686,7 +9198,7 @@ void ieee80211_mgd_stop_link(struct ieee80211_link_data *link) wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, - &link->u.mgd.chswitch_work); + &link->u.mgd.csa.switch_work); } void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) @@ -8704,15 +9216,8 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) &ifmgd->beacon_connection_loss_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); - wiphy_work_cancel(sdata->local->hw.wiphy, - &ifmgd->teardown_ttlm_work); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->tdls_peer_del_work); - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, - &ifmgd->ml_reconf_work); - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, - &ifmgd->neg_ttlm_timeout_work); if (ifmgd->assoc_data) ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 65e1e9e971fd..28d03196ef75 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,7 +8,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2019, 2022-2023 Intel Corporation + * Copyright (C) 2019, 2022-2024 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> @@ -413,6 +413,39 @@ void ieee80211_start_next_roc(struct ieee80211_local *local) } } +void ieee80211_reconfig_roc(struct ieee80211_local *local) +{ + struct ieee80211_roc_work *roc, *tmp; + + /* + * In the software implementation can just continue with the + * interruption due to reconfig, roc_work is still queued if + * needed. + */ + if (!local->ops->remain_on_channel) + return; + + /* flush work so nothing from the driver is still pending */ + wiphy_work_flush(local->hw.wiphy, &local->hw_roc_start); + wiphy_work_flush(local->hw.wiphy, &local->hw_roc_done); + + list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { + if (!roc->started) + break; + + if (!roc->hw_begun) { + /* it didn't start in HW yet, so we can restart it */ + roc->started = false; + continue; + } + + /* otherwise destroy it and tell userspace */ + ieee80211_roc_notify_destroy(roc); + } + + ieee80211_start_next_roc(local); +} + static void __ieee80211_roc_work(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index 055a60e90979..28aae14db8a9 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -187,6 +187,84 @@ ieee80211_parse_extension_element(u32 *crc, *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2); } +static void ieee80211_parse_tpe(struct ieee80211_parsed_tpe *tpe, + const u8 *data, u8 len) +{ + const struct ieee80211_tx_pwr_env *env = (const void *)data; + u8 count, interpret, category; + u8 *out, N, *cnt_out = NULL, *N_out = NULL; + + if (!ieee80211_valid_tpe_element(data, len)) + return; + + count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT); + interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET); + category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY); + + switch (interpret) { + case IEEE80211_TPE_LOCAL_EIRP: + out = tpe->max_local[category].power; + cnt_out = &tpe->max_local[category].count; + tpe->max_local[category].valid = true; + break; + case IEEE80211_TPE_REG_CLIENT_EIRP: + out = tpe->max_reg_client[category].power; + cnt_out = &tpe->max_reg_client[category].count; + tpe->max_reg_client[category].valid = true; + break; + case IEEE80211_TPE_LOCAL_EIRP_PSD: + out = tpe->psd_local[category].power; + cnt_out = &tpe->psd_local[category].count; + N_out = &tpe->psd_local[category].n; + tpe->psd_local[category].valid = true; + break; + case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: + out = tpe->psd_reg_client[category].power; + cnt_out = &tpe->psd_reg_client[category].count; + N_out = &tpe->psd_reg_client[category].n; + tpe->psd_reg_client[category].valid = true; + break; + } + + switch (interpret) { + case IEEE80211_TPE_LOCAL_EIRP: + case IEEE80211_TPE_REG_CLIENT_EIRP: + /* count was validated <= 3, plus 320 MHz */ + BUILD_BUG_ON(IEEE80211_TPE_EIRP_ENTRIES_320MHZ < 5); + memcpy(out, env->variable, count + 1); + *cnt_out = count + 1; + /* separately take 320 MHz if present */ + if (count == 3 && len > sizeof(*env) + count + 1) { + out[4] = env->variable[count + 2]; + *cnt_out = 5; + } + break; + case IEEE80211_TPE_LOCAL_EIRP_PSD: + case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: + if (!count) { + memset(out, env->variable[0], + IEEE80211_TPE_PSD_ENTRIES_320MHZ); + *cnt_out = IEEE80211_TPE_PSD_ENTRIES_320MHZ; + break; + } + + N = 1 << (count - 1); + memcpy(out, env->variable, N); + *cnt_out = N; + *N_out = N; + + if (len > sizeof(*env) + N) { + int K = u8_get_bits(env->variable[N], + IEEE80211_TX_PWR_ENV_EXT_COUNT); + + K = min(K, IEEE80211_TPE_PSD_ENTRIES_320MHZ - N); + memcpy(out + N, env->variable + N + 1, K); + (*cnt_out) += K; + } + break; + } +} + static u32 _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, struct ieee80211_elems_parse *elems_parse, @@ -529,6 +607,13 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; } + + subelem = cfg80211_find_ext_elem(WLAN_EID_TX_POWER_ENVELOPE, + pos, elen); + if (subelem) + ieee80211_parse_tpe(&elems->csa_tpe, + subelem->data + 1, + subelem->datalen - 1); break; case WLAN_EID_COUNTRY: elems->country_elem = pos; @@ -593,16 +678,9 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, elems->rsnx_len = elen; break; case WLAN_EID_TX_POWER_ENVELOPE: - if (elen < 1 || - elen > sizeof(struct ieee80211_tx_pwr_env)) + if (params->mode < IEEE80211_CONN_MODE_HE) break; - - if (elems->tx_pwr_env_num >= ARRAY_SIZE(elems->tx_pwr_env)) - break; - - elems->tx_pwr_env[elems->tx_pwr_env_num] = (void *)pos; - elems->tx_pwr_env_len[elems->tx_pwr_env_num] = elen; - elems->tx_pwr_env_num++; + ieee80211_parse_tpe(&elems->tpe, pos, elen); break; case WLAN_EID_EXTENSION: ieee80211_parse_extension_element(calc_crc ? @@ -889,6 +967,10 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) elems->ie_start = params->start; elems->total_len = params->len; + /* set all TPE entries to unlimited (but invalid) */ + ieee80211_clear_tpe(&elems->tpe); + ieee80211_clear_tpe(&elems->csa_tpe); + nontransmitted_profile = elems_parse->scratch_pos; nontransmitted_profile_len = ieee802_11_find_bssid_profile(params->start, params->len, diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index b2de4c6fb808..df96d3db1c0e 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2018, 2020, 2022-2023 Intel Corporation + * Copyright (C) 2018, 2020, 2022-2024 Intel Corporation */ #include <linux/ieee80211.h> @@ -366,6 +366,9 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, validate_chandef_by_ht_vht_oper(sdata, conn, vht_cap_info, &new_chandef); + /* capture the AP chandef before (potential) downgrading */ + csa_ie->chanreq.ap = new_chandef; + /* if data is there validate the bandwidth & use it */ if (new_chandef.chan) { if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320 && diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile index 4fdaf3feaca3..511dfa226699 100644 --- a/net/mac80211/tests/Makefile +++ b/net/mac80211/tests/Makefile @@ -1,3 +1,3 @@ -mac80211-tests-y += module.o elems.o mfp.o +mac80211-tests-y += module.o elems.o mfp.o tpe.o obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o diff --git a/net/mac80211/tests/tpe.c b/net/mac80211/tests/tpe.c new file mode 100644 index 000000000000..dd63303a2985 --- /dev/null +++ b/net/mac80211/tests/tpe.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for TPE element handling + * + * Copyright (C) 2024 Intel Corporation + */ +#include <kunit/test.h> +#include "../ieee80211_i.h" + +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); + +static struct ieee80211_channel chan6g_1 = { + .band = NL80211_BAND_6GHZ, + .center_freq = 5955, +}; + +static struct ieee80211_channel chan6g_33 = { + .band = NL80211_BAND_6GHZ, + .center_freq = 6115, +}; + +static struct ieee80211_channel chan6g_61 = { + .band = NL80211_BAND_6GHZ, + .center_freq = 6255, +}; + +static const struct subchan_test_case { + const char *desc; + struct cfg80211_chan_def c; + u8 n; + int expect; +} subchan_offset_cases[] = { + { + .desc = "identical 20 MHz", + .c.width = NL80211_CHAN_WIDTH_20, + .c.chan = &chan6g_1, + .c.center_freq1 = 5955, + .n = 1, + .expect = 0, + }, + { + .desc = "identical 40 MHz", + .c.width = NL80211_CHAN_WIDTH_40, + .c.chan = &chan6g_1, + .c.center_freq1 = 5965, + .n = 2, + .expect = 0, + }, + { + .desc = "identical 80+80 MHz", + /* not really is valid? doesn't matter for the test */ + .c.width = NL80211_CHAN_WIDTH_80P80, + .c.chan = &chan6g_1, + .c.center_freq1 = 5985, + .c.center_freq2 = 6225, + .n = 16, + .expect = 0, + }, + { + .desc = "identical 320 MHz", + .c.width = NL80211_CHAN_WIDTH_320, + .c.chan = &chan6g_1, + .c.center_freq1 = 6105, + .n = 16, + .expect = 0, + }, + { + .desc = "lower 160 MHz of 320 MHz", + .c.width = NL80211_CHAN_WIDTH_320, + .c.chan = &chan6g_1, + .c.center_freq1 = 6105, + .n = 8, + .expect = 0, + }, + { + .desc = "upper 160 MHz of 320 MHz", + .c.width = NL80211_CHAN_WIDTH_320, + .c.chan = &chan6g_61, + .c.center_freq1 = 6105, + .n = 8, + .expect = 8, + }, + { + .desc = "upper 160 MHz of 320 MHz, go to 40", + .c.width = NL80211_CHAN_WIDTH_320, + .c.chan = &chan6g_61, + .c.center_freq1 = 6105, + .n = 2, + .expect = 8 + 4 + 2, + }, + { + .desc = "secondary 80 above primary in 80+80 MHz", + /* not really is valid? doesn't matter for the test */ + .c.width = NL80211_CHAN_WIDTH_80P80, + .c.chan = &chan6g_1, + .c.center_freq1 = 5985, + .c.center_freq2 = 6225, + .n = 4, + .expect = 0, + }, + { + .desc = "secondary 80 below primary in 80+80 MHz", + /* not really is valid? doesn't matter for the test */ + .c.width = NL80211_CHAN_WIDTH_80P80, + .c.chan = &chan6g_61, + .c.center_freq1 = 6225, + .c.center_freq2 = 5985, + .n = 4, + .expect = 4, + }, + { + .desc = "secondary 80 below primary in 80+80 MHz, go to 20", + /* not really is valid? doesn't matter for the test */ + .c.width = NL80211_CHAN_WIDTH_80P80, + .c.chan = &chan6g_61, + .c.center_freq1 = 6225, + .c.center_freq2 = 5985, + .n = 1, + .expect = 7, + }, +}; + +KUNIT_ARRAY_PARAM_DESC(subchan_offset, subchan_offset_cases, desc); + +static void subchan_offset(struct kunit *test) +{ + const struct subchan_test_case *params = test->param_value; + int offset; + + KUNIT_ASSERT_EQ(test, cfg80211_chandef_valid(¶ms->c), true); + + offset = ieee80211_calc_chandef_subchan_offset(¶ms->c, params->n); + + KUNIT_EXPECT_EQ(test, params->expect, offset); +} + +static const struct psd_reorder_test_case { + const char *desc; + struct cfg80211_chan_def ap, used; + struct ieee80211_parsed_tpe_psd psd, out; +} psd_reorder_cases[] = { + { + .desc = "no changes, 320 MHz", + + .ap.width = NL80211_CHAN_WIDTH_320, + .ap.chan = &chan6g_1, + .ap.center_freq1 = 6105, + + .used.width = NL80211_CHAN_WIDTH_320, + .used.chan = &chan6g_1, + .used.center_freq1 = 6105, + + .psd.valid = true, + .psd.count = 16, + .psd.n = 8, + .psd.power = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + + .out.valid = true, + .out.count = 16, + .out.n = 8, + .out.power = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + }, + { + .desc = "no changes, 320 MHz, 160 MHz used, n=0", + + .ap.width = NL80211_CHAN_WIDTH_320, + .ap.chan = &chan6g_1, + .ap.center_freq1 = 6105, + + .used.width = NL80211_CHAN_WIDTH_160, + .used.chan = &chan6g_1, + .used.center_freq1 = 6025, + + .psd.valid = true, + .psd.count = 16, + .psd.n = 0, + .psd.power = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }, + + .out.valid = true, + .out.count = 8, + .out.n = 0, + .out.power = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }, + }, + { + .desc = "320 MHz, HE is 80, used 160, all lower", + + .ap.width = NL80211_CHAN_WIDTH_320, + .ap.chan = &chan6g_1, + .ap.center_freq1 = 6105, + + .used.width = NL80211_CHAN_WIDTH_160, + .used.chan = &chan6g_1, + .used.center_freq1 = 6025, + + .psd.valid = true, + .psd.count = 16, + .psd.n = 4, + .psd.power = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + + .out.valid = true, + .out.count = 8, + .out.n = 4, + .out.power = { 0, 1, 2, 3, 4, 5, 6, 7, 127, 127, 127, 127, 127, 127, 127, 127}, + }, + { + .desc = "320 MHz, HE is 80, used 160, all upper", + /* + * EHT: | | | | | | | | | | | | | | | | | + * HE: | | | | | + * used: | | | | | | | | | + */ + + .ap.width = NL80211_CHAN_WIDTH_320, + .ap.chan = &chan6g_61, + .ap.center_freq1 = 6105, + + .used.width = NL80211_CHAN_WIDTH_160, + .used.chan = &chan6g_61, + .used.center_freq1 = 6185, + + .psd.valid = true, + .psd.count = 16, + .psd.n = 4, + .psd.power = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + + .out.valid = true, + .out.count = 8, + .out.n = 4, + .out.power = { 12, 13, 14, 15, 0, 1, 2, 3, 127, 127, 127, 127, 127, 127, 127, 127}, + }, + { + .desc = "320 MHz, HE is 80, used 160, split", + /* + * EHT: | | | | | | | | | | | | | | | | | + * HE: | | | | | + * used: | | | | | | | | | + */ + + .ap.width = NL80211_CHAN_WIDTH_320, + .ap.chan = &chan6g_33, + .ap.center_freq1 = 6105, + + .used.width = NL80211_CHAN_WIDTH_160, + .used.chan = &chan6g_33, + .used.center_freq1 = 6185, + + .psd.valid = true, + .psd.count = 16, + .psd.n = 4, + .psd.power = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + + .out.valid = true, + .out.count = 8, + .out.n = 4, + .out.power = { 0, 1, 2, 3, 12, 13, 14, 15, 127, 127, 127, 127, 127, 127, 127, 127}, + }, +}; + +KUNIT_ARRAY_PARAM_DESC(psd_reorder, psd_reorder_cases, desc); + +static void psd_reorder(struct kunit *test) +{ + const struct psd_reorder_test_case *params = test->param_value; + struct ieee80211_parsed_tpe_psd tmp = params->psd; + + KUNIT_ASSERT_EQ(test, cfg80211_chandef_valid(¶ms->ap), true); + KUNIT_ASSERT_EQ(test, cfg80211_chandef_valid(¶ms->used), true); + + ieee80211_rearrange_tpe_psd(&tmp, ¶ms->ap, ¶ms->used); + KUNIT_EXPECT_MEMEQ(test, &tmp, ¶ms->out, sizeof(tmp)); +} + +static struct kunit_case tpe_test_cases[] = { + KUNIT_CASE_PARAM(subchan_offset, subchan_offset_gen_params), + KUNIT_CASE_PARAM(psd_reorder, psd_reorder_gen_params), + {} +}; + +static struct kunit_suite tpe = { + .name = "mac80211-tpe", + .test_cases = tpe_test_cases, +}; + +kunit_test_suite(tpe); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 771c05640aa3..d7c45b098be9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2179,8 +2179,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) local->in_reconfig = false; barrier(); - /* Restart deferred ROCs */ - ieee80211_start_next_roc(local); + ieee80211_reconfig_roc(local); /* Requeue all works */ list_for_each_entry(sdata, &local->interfaces, list) @@ -3460,12 +3459,8 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { - /* it might be waiting for the local->mtx, but then - * by the time it gets it, sdata->wdev.cac_started - * will no longer be true - */ wiphy_delayed_work_cancel(local->hw.wiphy, - &sdata->deflink.dfs_cac_timer_work); + &sdata->dfs_cac_timer_work); if (sdata->wdev.cac_started) { chandef = sdata->vif.bss_conf.chanreq.oper; @@ -4342,3 +4337,28 @@ ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef) return IEEE80211_CONN_BW_LIMIT_20; } } + +void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) +{ + for (int i = 0; i < 2; i++) { + tpe->max_local[i].valid = false; + memset(tpe->max_local[i].power, + IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, + sizeof(tpe->max_local[i].power)); + + tpe->max_reg_client[i].valid = false; + memset(tpe->max_reg_client[i].power, + IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, + sizeof(tpe->max_reg_client[i].power)); + + tpe->psd_local[i].valid = false; + memset(tpe->psd_local[i].power, + IEEE80211_TPE_PSD_NO_LIMIT, + sizeof(tpe->psd_local[i].power)); + + tpe->psd_reg_client[i].valid = false; + memset(tpe->psd_reg_client[i].power, + IEEE80211_TPE_PSD_NO_LIMIT, + sizeof(tpe->psd_reg_client[i].power)); + } +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bb7dca8aa2d9..a26c2c840fd9 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2040,13 +2040,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), + rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); + window_clamp = mptcp_win_from_space(sk, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we @@ -2202,7 +2202,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; - /* only the master socket status is relevant here. The exit + /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) @@ -3526,7 +3526,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status) static int mptcp_hash(struct sock *sk) { /* should never be called, - * we hash the TCP subflows not the master socket + * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7aa47e2dd52b..b11a4e50d52b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -386,6 +386,11 @@ static inline int mptcp_win_from_space(const struct sock *sk, int space) return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } +static inline int mptcp_space_from_win(const struct sock *sk, int win) +{ + return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win); +} + static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index f9a4fb17b5b7..2026a9a36f80 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1579,7 +1579,7 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - space = __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val); + space = mptcp_space_from_win(sk, val); if (space <= sk->sk_rcvbuf) return 0; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 612c38570a64..39e2cbdf3801 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1719,7 +1719,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); - /* the newly created socket really belongs to the owning MPTCP master + /* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b6d0dcf3a5c3..78a1cc72dc38 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1924,7 +1924,8 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } -static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) +static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, + void *buffer) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; @@ -1962,8 +1963,8 @@ out: return ret; } -static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, - size_t size) +static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, + void *buffer, size_t size) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index d2492d050fe6..4a136fc3a9c0 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -32,7 +32,9 @@ * -EINVAL - Passed NULL for bpf_tuple pointer * -EINVAL - opts->reserved is not 0 * -EINVAL - netns_id is less than -1 - * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12) + * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 + * -EINVAL - opts->ct_zone_id set when + opts__sz isn't NF_BPF_CT_OPTS_SZ (16) * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP * -ENONET - No network namespace found for netns_id * -ENOENT - Conntrack lookup could not find entry for tuple @@ -42,6 +44,8 @@ * Values: * IPPROTO_TCP, IPPROTO_UDP * @dir: - connection tracking tuple direction. + * @ct_zone_id - connection tracking zone id. + * @ct_zone_dir - connection tracking zone direction. * @reserved - Reserved member, will be reused for more options in future * Values: * 0 @@ -51,11 +55,13 @@ struct bpf_ct_opts { s32 error; u8 l4proto; u8 dir; - u8 reserved[2]; + u16 ct_zone_id; + u8 ct_zone_dir; + u8 reserved[3]; }; enum { - NF_BPF_CT_OPTS_SZ = 12, + NF_BPF_CT_OPTS_SZ = 16, }; static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, @@ -104,12 +110,21 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, u32 timeout) { struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) return ERR_PTR(-EINVAL); @@ -130,7 +145,16 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, return ERR_PTR(-ENONET); } - ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple, + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, GFP_ATOMIC); if (IS_ERR(ct)) goto out; @@ -152,12 +176,21 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, { struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) return ERR_PTR(-EPROTO); if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) @@ -174,7 +207,16 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, return ERR_PTR(-ENONET); } - hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + hash = nf_conntrack_find_get(net, &ct_zone, &tuple); if (opts->netns_id >= 0) put_net(net); if (!hash) @@ -245,7 +287,7 @@ __bpf_kfunc_start_defs(); * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn___init * bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -279,7 +321,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn * bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -312,7 +354,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn___init * bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -347,7 +389,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn * bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index a83637e3f455..580c55268f65 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -317,7 +317,7 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, net_get_random_once(&trace_key, sizeof(trace_key)); info->skbid = (u32)siphash_3u32(hash32_ptr(skb), - skb_get_hash(skb), + skb_get_hash_net(nft_net(pkt), skb), skb->skb_iif, &trace_key); } diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 92d47e469204..868d68302d22 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -51,7 +51,8 @@ static void nft_symhash_eval(const struct nft_expr *expr, struct sk_buff *skb = pkt->skb; u32 h; - h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus); + h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb), + priv->modulus); regs->data[priv->dreg] = h + priv->offset; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fa9c090cf629..0b7a89db3ab7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -636,8 +636,7 @@ static struct proto netlink_proto = { }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *dump_cb_mutex, int protocol, - int kern) + int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; @@ -655,7 +654,6 @@ static int __netlink_create(struct net *net, struct socket *sock, lockdep_set_class_and_name(&nlk->nl_cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); - nlk->dump_cb_mutex = dump_cb_mutex; init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; @@ -667,7 +665,6 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; - struct mutex *cb_mutex; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); @@ -696,7 +693,6 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; - cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; release = nl_table[protocol].release; @@ -705,7 +701,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, if (err < 0) goto out; - err = __netlink_create(net, sock, cb_mutex, protocol, kern); + err = __netlink_create(net, sock, protocol, kern); if (err < 0) goto out_module; @@ -2016,7 +2012,6 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; - struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL; unsigned int groups; BUG_ON(!nl_table); @@ -2027,7 +2022,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) + if (__netlink_create(net, sock, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; @@ -2055,7 +2050,6 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); - nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; @@ -2326,17 +2320,9 @@ static int netlink_dump(struct sock *sk, bool lock_taken) netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { - struct mutex *extra_mutex = nlk->dump_cb_mutex; - cb->extack = &extack; - if (cb->flags & RTNL_FLAG_DUMP_UNLOCKED) - extra_mutex = NULL; - if (extra_mutex) - mutex_lock(extra_mutex); nlk->dump_done_errno = cb->dump(skb, cb); - if (extra_mutex) - mutex_unlock(extra_mutex); /* EMSGSIZE plus something already in the skb means * that there's more to dump but current skb has filled up. diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 74c88a6baa43..4b33133cbdff 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -85,7 +85,6 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, - .ndo_get_stats64 = dev_get_tstats64, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -140,11 +139,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_vport; } - vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!vport->dev->tstats) { - err = -ENOMEM; - goto error_free_netdev; - } + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); dev->ifindex = parms->desired_ifindex; @@ -169,8 +164,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) error_unlock: rtnl_unlock(); - free_percpu(dev->tstats); -error_free_netdev: free_netdev(dev); error_free_vport: ovs_vport_free(vport); @@ -186,7 +179,6 @@ static void internal_dev_destroy(struct vport *vport) /* unregister_netdevice() waits for an RCU grace period. */ unregister_netdevice(vport->dev); - free_percpu(vport->dev->tstats); rtnl_unlock(); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ea3ebc160e25..42d29b8a84fc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2056,8 +2056,7 @@ retry: skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); - skb->tstamp = sockc.transmit_time; - + skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, sockc.tsflags); if (unlikely(extra_len == 4)) @@ -2122,7 +2121,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; - struct sock *sk; + struct sock *sk = NULL; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; @@ -2227,7 +2226,7 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; } @@ -2235,7 +2234,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; - struct sock *sk; + struct sock *sk = NULL; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; @@ -2495,7 +2494,7 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; drop_n_account: @@ -2504,7 +2503,7 @@ drop_n_account: drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); - kfree_skb_reason(copy_skb, drop_reason); + sk_skb_reason_drop(sk, copy_skb, drop_reason); goto drop_n_restore; } @@ -2584,7 +2583,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); @@ -3062,7 +3061,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc.mark; - skb->tstamp = sockc.transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); if (unlikely(extra_len == 4)) skb->no_fcs = 1; diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 654a3cc0d347..3de9350cbf30 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -132,8 +132,8 @@ static int service_announce_new(struct sockaddr_qrtr *dest, return kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); } -static int service_announce_del(struct sockaddr_qrtr *dest, - struct qrtr_server *srv) +static void service_announce_del(struct sockaddr_qrtr *dest, + struct qrtr_server *srv) { struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; @@ -157,10 +157,10 @@ static int service_announce_del(struct sockaddr_qrtr *dest, msg.msg_namelen = sizeof(*dest); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); - if (ret < 0) + if (ret < 0 && ret != -ENODEV) pr_err("failed to announce del service\n"); - return ret; + return; } static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, @@ -188,7 +188,7 @@ static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, msg.msg_namelen = sizeof(*to); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); - if (ret < 0) + if (ret < 0 && ret != -ENODEV) pr_err("failed to send lookup notification\n"); } @@ -207,6 +207,9 @@ static int announce_servers(struct sockaddr_qrtr *sq) xa_for_each(&node->servers, index, srv) { ret = service_announce_new(sq, srv); if (ret < 0) { + if (ret == -ENODEV) + continue; + pr_err("failed to announce new service\n"); return ret; } @@ -369,7 +372,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) msg.msg_namelen = sizeof(sq); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); - if (ret < 0) { + if (ret < 0 && ret != -ENODEV) { pr_err("failed to send bye cmd\n"); return ret; } @@ -443,7 +446,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, msg.msg_namelen = sizeof(sq); ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt)); - if (ret < 0) { + if (ret < 0 && ret != -ENODEV) { pr_err("failed to send del client cmd\n"); return ret; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index d8111ac83bb6..3dc6956f66f8 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -719,9 +719,7 @@ static int __init rds_tcp_init(void) { int ret; - rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", - sizeof(struct rds_tcp_connection), - 0, 0, NULL); + rds_tcp_conn_slab = KMEM_CACHE(rds_tcp_connection, 0); if (!rds_tcp_conn_slab) { ret = -ENOMEM; goto out; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index c00f04a1a534..7997a19d1da3 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -337,9 +337,7 @@ out: int rds_tcp_recv_init(void) { - rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", - sizeof(struct rds_tcp_incoming), - 0, 0, NULL); + rds_tcp_incoming_slab = KMEM_CACHE(rds_tcp_incoming, 0); if (!rds_tcp_incoming_slab) return -ENOMEM; return 0; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2520708b06a1..2714c4ed928e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -62,7 +62,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, { struct tc_cookie *old; - old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); + old = unrcu_pointer(xchg(old_cookie, RCU_INITIALIZER(new_cookie))); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 0e3cf11ae5fc..396b576390d0 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -54,8 +54,8 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5e83e890f6a4..1941ebec23ff 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -104,8 +104,8 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (prog->exts_integrated) { res->class = 0; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index fd9a6f20b60b..eef570c577ac 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -41,6 +41,12 @@ #define TCA_FLOWER_KEY_CT_FLAGS_MASK \ (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) +#define TUNNEL_FLAGS_PRESENT (\ + _BITUL(IP_TUNNEL_CSUM_BIT) | \ + _BITUL(IP_TUNNEL_DONT_FRAGMENT_BIT) | \ + _BITUL(IP_TUNNEL_OAM_BIT) | \ + _BITUL(IP_TUNNEL_CRIT_OPT_BIT)) + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@ -75,6 +81,7 @@ struct fl_flow_key { struct flow_dissector_key_l2tpv3 l2tpv3; struct flow_dissector_key_ipsec ipsec; struct flow_dissector_key_cfm cfm; + struct flow_dissector_key_enc_flags enc_flags; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -732,6 +739,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_SPI_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_FLAGS] = NLA_POLICY_MASK(NLA_U32, + TUNNEL_FLAGS_PRESENT), + [TCA_FLOWER_KEY_ENC_FLAGS_MASK] = NLA_POLICY_MASK(NLA_U32, + TUNNEL_FLAGS_PRESENT), }; static const struct nla_policy @@ -1825,6 +1836,21 @@ static int fl_set_key_cfm(struct nlattr **tb, return 0; } +static int fl_set_key_enc_flags(struct nlattr **tb, u32 *flags_key, + u32 *flags_mask, struct netlink_ext_ack *extack) +{ + /* mask is mandatory for flags */ + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, TCA_FLOWER_KEY_ENC_FLAGS_MASK)) { + NL_SET_ERR_MSG(extack, "missing enc_flags mask"); + return -EINVAL; + } + + *flags_key = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_FLAGS]); + *flags_mask = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_FLAGS_MASK]); + + return 0; +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -2059,9 +2085,16 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (ret) return ret; - if (tb[TCA_FLOWER_KEY_FLAGS]) + if (tb[TCA_FLOWER_KEY_FLAGS]) { ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags, extack); + if (ret) + return ret; + } + + if (tb[TCA_FLOWER_KEY_ENC_FLAGS]) + ret = fl_set_key_enc_flags(tb, &key->enc_flags.flags, + &mask->enc_flags.flags, extack); return ret; } @@ -2175,6 +2208,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_IPSEC, ipsec); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CFM, cfm); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_FLAGS, enc_flags); skb_flow_dissector_init(dissector, keys, cnt); } @@ -3291,6 +3326,22 @@ err_cfm_opts: return err; } +static int fl_dump_key_enc_flags(struct sk_buff *skb, + struct flow_dissector_key_enc_flags *key, + struct flow_dissector_key_enc_flags *mask) +{ + if (!memchr_inv(mask, 0, sizeof(*mask))) + return 0; + + if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_FLAGS, key->flags)) + return -EMSGSIZE; + + if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_FLAGS_MASK, mask->flags)) + return -EMSGSIZE; + + return 0; +} + static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, struct flow_dissector_key_enc_opts *enc_opts) { @@ -3592,6 +3643,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_cfm(skb, &key->cfm, &mask->cfm)) goto nla_put_failure; + if (fl_dump_key_enc_flags(skb, &key->enc_flags, &mask->enc_flags)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e22ff003d52e..2af24547a82c 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -633,6 +633,7 @@ EXPORT_SYMBOL_GPL(netif_carrier_event); static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { + dev_core_stats_tx_dropped_inc(skb->dev); __qdisc_drop(skb, to_free); return NET_XMIT_CN; } diff --git a/net/smc/Makefile b/net/smc/Makefile index 2c510d543058..60f1c87d5212 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,6 +4,6 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o +smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o smc-$(CONFIG_SMC_LO) += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c5f98c6b2561..73a875573e7a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -54,6 +54,7 @@ #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_loopback.h" +#include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -170,15 +171,15 @@ static bool smc_hs_congested(const struct sock *sk) return false; } -static struct smc_hashinfo smc_v4_hashinfo = { +struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; -static struct smc_hashinfo smc_v6_hashinfo = { +struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; -static int smc_hash_sk(struct sock *sk) +int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; @@ -193,7 +194,7 @@ static int smc_hash_sk(struct sock *sk) return 0; } -static void smc_unhash_sk(struct sock *sk) +void smc_unhash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; @@ -207,7 +208,7 @@ static void smc_unhash_sk(struct sock *sk) * work which we didn't do because of user hold the sock_lock in the * BH context */ -static void smc_release_cb(struct sock *sk) +void smc_release_cb(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); @@ -307,7 +308,7 @@ static int __smc_release(struct smc_sock *smc) return rc; } -static int smc_release(struct socket *sock) +int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -361,25 +362,15 @@ static void smc_destruct(struct sock *sk) return; } -static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, - int protocol) +void smc_sk_init(struct net *net, struct sock *sk, int protocol) { - struct smc_sock *smc; - struct proto *prot; - struct sock *sk; - - prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; - sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); - if (!sk) - return NULL; + struct smc_sock *smc = smc_sk(sk); - sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); - smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -389,12 +380,30 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); + smc->limit_smc_hs = net->smc.limit_smc_hs; + smc->use_fallback = false; /* assume rdma capability first */ + smc->fallback_rsn = 0; +} + +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, + int protocol) +{ + struct proto *prot; + struct sock *sk; + + prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); + if (!sk) + return NULL; + + sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ + smc_sk_init(net, sk, protocol); return sk; } -static int smc_bind(struct socket *sock, struct sockaddr *uaddr, - int addr_len) +int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; @@ -1623,8 +1632,8 @@ out: release_sock(&smc->sk); } -static int smc_connect(struct socket *sock, struct sockaddr *addr, - int alen, int flags) +int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2605,7 +2614,7 @@ out: read_unlock_bh(&listen_clcsock->sk_callback_lock); } -static int smc_listen(struct socket *sock, int backlog) +int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2670,8 +2679,8 @@ out: return rc; } -static int smc_accept(struct socket *sock, struct socket *new_sock, - struct proto_accept_arg *arg) +int smc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); @@ -2740,8 +2749,8 @@ out: return rc; } -static int smc_getname(struct socket *sock, struct sockaddr *addr, - int peer) +int smc_getname(struct socket *sock, struct sockaddr *addr, + int peer) { struct smc_sock *smc; @@ -2754,7 +2763,7 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr, return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } -static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2792,8 +2801,8 @@ out: return rc; } -static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2842,8 +2851,8 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2895,7 +2904,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, return mask; } -static int smc_shutdown(struct socket *sock, int how) +int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; bool do_shutdown = true; @@ -3035,8 +3044,8 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, return rc; } -static int smc_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +int smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -3122,8 +3131,8 @@ out: return rc; } -static int smc_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) { struct smc_sock *smc; int rc; @@ -3148,8 +3157,8 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return rc; } -static int smc_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg) +int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) { union smc_host_cursor cons, urg; struct smc_connection *conn; @@ -3235,9 +3244,9 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, * Note that subsequent recv() calls have to wait till all splice() processing * completed. */ -static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -3303,6 +3312,31 @@ static const struct proto_ops smc_sock_ops = { .splice_read = smc_splice_read, }; +int smc_create_clcsk(struct net *net, struct sock *sk, int family) +{ + struct smc_sock *smc = smc_sk(sk); + int rc; + + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) { + sk_common_release(sk); + return rc; + } + + /* smc_clcsock_release() does not wait smc->clcsock->sk's + * destruction; its sk_state might not be TCP_CLOSE after + * smc->sk is close()d, and TCP timers can be fired later, + * which need net ref. + */ + sk = smc->clcsock->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); + return 0; +} + static int __smc_create(struct net *net, struct socket *sock, int protocol, int kern, struct socket *clcsock) { @@ -3328,35 +3362,12 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); - smc->use_fallback = false; /* assume rdma capability first */ - smc->fallback_rsn = 0; - - /* default behavior from limit_smc_hs in every net namespace */ - smc->limit_smc_hs = net->smc.limit_smc_hs; rc = 0; - if (!clcsock) { - rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, - &smc->clcsock); - if (rc) { - sk_common_release(sk); - goto out; - } - - /* smc_clcsock_release() does not wait smc->clcsock->sk's - * destruction; its sk_state might not be TCP_CLOSE after - * smc->sk is close()d, and TCP timers can be fired later, - * which need net ref. - */ - sk = smc->clcsock->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); - } else { + if (clcsock) smc->clcsock = clcsock; - } - + else + rc = smc_create_clcsk(net, sk, family); out: return rc; } @@ -3565,10 +3576,15 @@ static int __init smc_init(void) pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); goto out_lo; } - + rc = smc_inet_init(); + if (rc) { + pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); + goto out_ulp; + } static_branch_enable(&tcp_have_smc); return 0; - +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); out_lo: smc_loopback_exit(); out_ib: @@ -3605,6 +3621,7 @@ out_pernet_subsys: static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_inet_exit(); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); @@ -3632,4 +3649,9 @@ MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); MODULE_ALIAS_TCP_ULP("smc"); +/* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1); +#if IS_ENABLED(CONFIG_IPV6) +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 256, 1); +#endif /* CONFIG_IPV6 */ MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME); diff --git a/net/smc/smc.h b/net/smc/smc.h index 18c8b7870198..34b781e463c4 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -34,6 +34,44 @@ extern struct proto smc_proto; extern struct proto smc_proto6; +extern struct smc_hashinfo smc_v4_hashinfo; +extern struct smc_hashinfo smc_v6_hashinfo; + +int smc_hash_sk(struct sock *sk); +void smc_unhash_sk(struct sock *sk); +void smc_release_cb(struct sock *sk); + +int smc_release(struct socket *sock); +int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len); +int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags); +int smc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg); +int smc_getname(struct socket *sock, struct sockaddr *addr, + int peer); +__poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait); +int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg); +int smc_listen(struct socket *sock, int backlog); +int smc_shutdown(struct socket *sock, int how); +int smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen); +int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen); +int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); +int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags); +ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + +/* smc sock initialization */ +void smc_sk_init(struct net *net, struct sock *sk, int protocol); +/* clcsock initialization */ +int smc_create_clcsk(struct net *net, struct sock *sk, int family); + #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index fafdb97adfad..3b95828d9976 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2006,7 +2006,7 @@ out: } #define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ -#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ +#define SMCR_RMBE_SIZES 15 /* 0 -> 16KB, 1 -> 32KB, .. 15 -> 512MB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. @@ -2015,7 +2015,6 @@ out: */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { - const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) @@ -2025,9 +2024,11 @@ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); +#ifdef CONFIG_ARCH_NO_SG_CHAIN if (!is_smcd && is_rmb) /* RMBs are backed by & limited to max size of scatterlists */ - compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + compressed = min_t(u8, compressed, ilog2((SG_MAX_SINGLE_ALLOC * PAGE_SIZE) >> 14)); +#endif return compressed; } diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c new file mode 100644 index 000000000000..bece346dd8e9 --- /dev/null +++ b/net/smc/smc_inet.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the IPPROTO_SMC (socket related) + * + * Copyright IBM Corp. 2016, 2018 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#include <net/protocol.h> +#include <net/sock.h> + +#include "smc_inet.h" +#include "smc.h" + +static int smc_inet_init_sock(struct sock *sk); + +static struct proto smc_inet_prot = { + .name = "INET_SMC", + .owner = THIS_MODULE, + .init = smc_inet_init_sock, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; + +static const struct proto_ops smc_inet_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .splice_read = smc_splice_read, +}; + +static struct inet_protosw smc_inet_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SMC, + .prot = &smc_inet_prot, + .ops = &smc_inet_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static struct proto smc_inet6_prot = { + .name = "INET6_SMC", + .owner = THIS_MODULE, + .init = smc_inet_init_sock, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v6_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; + +static const struct proto_ops smc_inet6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .splice_read = smc_splice_read, +}; + +static struct inet_protosw smc_inet6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SMC, + .prot = &smc_inet6_prot, + .ops = &smc_inet6_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; +#endif /* CONFIG_IPV6 */ + +static int smc_inet_init_sock(struct sock *sk) +{ + struct net *net = sock_net(sk); + + /* init common smc sock */ + smc_sk_init(net, sk, IPPROTO_SMC); + /* create clcsock */ + return smc_create_clcsk(net, sk, sk->sk_family); +} + +int __init smc_inet_init(void) +{ + int rc; + + rc = proto_register(&smc_inet_prot, 1); + if (rc) { + pr_err("%s: proto_register smc_inet_prot fails with %d\n", + __func__, rc); + return rc; + } + /* no return value */ + inet_register_protosw(&smc_inet_protosw); + +#if IS_ENABLED(CONFIG_IPV6) + rc = proto_register(&smc_inet6_prot, 1); + if (rc) { + pr_err("%s: proto_register smc_inet6_prot fails with %d\n", + __func__, rc); + goto out_inet6_prot; + } + rc = inet6_register_protosw(&smc_inet6_protosw); + if (rc) { + pr_err("%s: inet6_register_protosw smc_inet6_protosw fails with %d\n", + __func__, rc); + goto out_inet6_protosw; + } + return rc; +out_inet6_protosw: + proto_unregister(&smc_inet6_prot); +out_inet6_prot: + inet_unregister_protosw(&smc_inet_protosw); + proto_unregister(&smc_inet_prot); +#endif /* CONFIG_IPV6 */ + return rc; +} + +void smc_inet_exit(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + inet6_unregister_protosw(&smc_inet6_protosw); + proto_unregister(&smc_inet6_prot); +#endif /* CONFIG_IPV6 */ + inet_unregister_protosw(&smc_inet_protosw); + proto_unregister(&smc_inet_prot); +} diff --git a/net/smc/smc_inet.h b/net/smc/smc_inet.h new file mode 100644 index 000000000000..a489c8a2b8ef --- /dev/null +++ b/net/smc/smc_inet.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the IPPROTO_SMC (socket related) + + * Copyright IBM Corp. 2016 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ +#ifndef __INET_SMC +#define __INET_SMC + +/* Initialize protocol registration on IPPROTO_SMC, + * @return 0 on success + */ +int smc_inet_init(void); + +void smc_inet_exit(void); + +#endif /* __INET_SMC */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ab6e694f7bc2..dc063c2c7950 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -231,14 +231,10 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { struct net_device *netdev; - struct sk_buff *skb; int err = 0; u8 *rcd_sn; - skb = tcp_write_queue_tail(sk); - if (skb) - TCP_SKB_CB(skb)->eor = 1; - + tcp_write_collapse_fence(sk); rcd_sn = tls_ctx->tx.rec_seq; trace_tls_device_tx_resync_send(sk, seq, rcd_sn); @@ -1067,7 +1063,6 @@ int tls_set_device_offload(struct sock *sk) struct tls_prot_info *prot; struct net_device *netdev; struct tls_context *ctx; - struct sk_buff *skb; char *iv, *rec_seq; int rc; @@ -1138,9 +1133,7 @@ int tls_set_device_offload(struct sock *sk) * SKBs where only part of the payload needs to be encrypted. * So mark the last skb in the write queue as end of record. */ - skb = tcp_write_queue_tail(sk); - if (skb) - TCP_SKB_CB(skb)->eor = 1; + tcp_write_collapse_fence(sk); /* Avoid offloading if the device is down * We don't want to offload new flows after diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 142f56770b77..b0a4c6d08e0a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -126,6 +126,81 @@ static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; * hash table is protected with spinlock. * each socket state is protected by separate spinlock. */ +#ifdef CONFIG_PROVE_LOCKING +#define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r))) + +static int unix_table_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) +{ + return cmp_ptr(a, b); +} + +static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, + const struct lockdep_map *_b) +{ + const struct unix_sock *a, *b; + + a = container_of(_a, struct unix_sock, lock.dep_map); + b = container_of(_b, struct unix_sock, lock.dep_map); + + if (a->sk.sk_state == TCP_LISTEN) { + /* unix_stream_connect(): Before the 2nd unix_state_lock(), + * + * 1. a is TCP_LISTEN. + * 2. b is not a. + * 3. concurrent connect(b -> a) must fail. + * + * Except for 2. & 3., the b's state can be any possible + * value due to concurrent connect() or listen(). + * + * 2. is detected in debug_spin_lock_before(), and 3. cannot + * be expressed as lock_cmp_fn. + */ + switch (b->sk.sk_state) { + case TCP_CLOSE: + case TCP_ESTABLISHED: + case TCP_LISTEN: + return -1; + default: + /* Invalid case. */ + return 0; + } + } + + /* Should never happen. Just to be symmetric. */ + if (b->sk.sk_state == TCP_LISTEN) { + switch (b->sk.sk_state) { + case TCP_CLOSE: + case TCP_ESTABLISHED: + return 1; + default: + return 0; + } + } + + /* unix_state_double_lock(): ascending address order. */ + return cmp_ptr(a, b); +} + +static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, + const struct lockdep_map *_b) +{ + const struct sock *a, *b; + + a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); + b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); + + /* unix_collect_skb(): listener -> embryo order. */ + if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) + return -1; + + /* Should never happen. Just to be symmetric. */ + if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) + return 1; + + return 0; +} +#endif static unsigned int unix_unbound_hash(struct sock *sk) { @@ -168,7 +243,7 @@ static void unix_table_double_lock(struct net *net, swap(hash1, hash2); spin_lock(&net->unx.table.locks[hash1]); - spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); + spin_lock(&net->unx.table.locks[hash2]); } static void unix_table_double_unlock(struct net *net, @@ -647,8 +722,8 @@ static void unix_release_sock(struct sock *sk, int embrion) while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); + /* passed fds are erased in the kfree_skb hook */ - UNIXCB(skb).consumed = skb->len; kfree_skb(skb); } @@ -676,14 +751,19 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { + sk->sk_peer_pid = get_pid(task_tgid(current)); + sk->sk_peer_cred = get_current_cred(); +} + +static void update_peercred(struct sock *sk) +{ const struct cred *old_cred; struct pid *old_pid; spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; - sk->sk_peer_pid = get_pid(task_tgid(current)); - sk->sk_peer_cred = get_current_cred(); + init_peercred(sk); spin_unlock(&sk->sk_peer_lock); put_pid(old_pid); @@ -692,26 +772,12 @@ static void init_peercred(struct sock *sk) static void copy_peercred(struct sock *sk, struct sock *peersk) { - const struct cred *old_cred; - struct pid *old_pid; + lockdep_assert_held(&unix_sk(peersk)->lock); - if (sk < peersk) { - spin_lock(&sk->sk_peer_lock); - spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&peersk->sk_peer_lock); - spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); - } - old_pid = sk->sk_peer_pid; - old_cred = sk->sk_peer_cred; - sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); + spin_lock(&sk->sk_peer_lock); + sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); - spin_unlock(&sk->sk_peer_lock); - spin_unlock(&peersk->sk_peer_lock); - - put_pid(old_pid); - put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) @@ -735,7 +801,7 @@ static int unix_listen(struct socket *sock, int backlog) WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ - init_peercred(sk); + update_peercred(sk); err = 0; out_unlock: @@ -972,12 +1038,15 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; + lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); + u = unix_sk(sk); u->listener = NULL; u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); + lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1326,11 +1395,12 @@ static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) unix_state_lock(sk1); return; } + if (sk1 > sk2) swap(sk1, sk2); unix_state_lock(sk1); - unix_state_lock_nested(sk2, U_LOCK_SECOND); + unix_state_lock(sk2); } static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) @@ -1473,6 +1543,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; + unsigned char state; long timeo; int err; @@ -1523,7 +1594,6 @@ restart: goto out; } - /* Latch state of peer */ unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ @@ -1553,37 +1623,21 @@ restart: goto restart; } - /* Latch our state. - - It is tricky place. We need to grab our state lock and cannot - drop lock on peer. It is dangerous because deadlock is - possible. Connect to self case and simultaneous - attempt to connect are eliminated by checking socket - state. other is TCP_LISTEN, if sk is TCP_LISTEN we - check this before attempt to grab lock. - - Well, and we have to recheck the state after socket locked. + /* self connect and simultaneous connect are eliminated + * by rejecting TCP_LISTEN socket to avoid deadlock. */ - switch (READ_ONCE(sk->sk_state)) { - case TCP_CLOSE: - /* This is ok... continue with connect */ - break; - case TCP_ESTABLISHED: - /* Socket is already connected */ - err = -EISCONN; - goto out_unlock; - default: - err = -EINVAL; + state = READ_ONCE(sk->sk_state); + if (unlikely(state != TCP_CLOSE)) { + err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; goto out_unlock; } - unix_state_lock_nested(sk, U_LOCK_SECOND); + unix_state_lock(sk); - if (sk->sk_state != TCP_CLOSE) { + if (unlikely(sk->sk_state != TCP_CLOSE)) { + err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; unix_state_unlock(sk); - unix_state_unlock(other); - sock_put(other); - goto restart; + goto out_unlock; } err = security_unix_stream_connect(sk, other, newsk); @@ -2717,9 +2771,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, skip = max(sk_peek_offset(sk, flags), 0); do { - int chunk; - bool drop_skb; struct sk_buff *skb, *last; + int chunk; redo: unix_state_lock(sk); @@ -2815,11 +2868,7 @@ unlock: } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - skb_get(skb); chunk = state->recv_actor(skb, skip, chunk, state); - drop_skb = !unix_skb_len(skb); - /* skb is only safe to use if !drop_skb */ - consume_skb(skb); if (chunk < 0) { if (copied == 0) copied = -EFAULT; @@ -2828,18 +2877,6 @@ unlock: copied += chunk; size -= chunk; - if (drop_skb) { - /* the skb was touched by a concurrent reader; - * we should not expect anything from this skb - * anymore and assume it invalid - we can be - * sure it was dropped from the socket queue - * - * let's report a short read - */ - err = 0; - break; - } - /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; @@ -3620,6 +3657,7 @@ static int __net_init unix_net_init(struct net *net) for (i = 0; i < UNIX_HASH_SIZE; i++) { spin_lock_init(&net->unx.table.locks[i]); + lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } diff --git a/net/unix/diag.c b/net/unix/diag.c index 937edf4afed4..9138af8b465e 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -47,9 +47,7 @@ static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb) peer = unix_peer_get(sk); if (peer) { - unix_state_lock(peer); ino = sock_i_ino(peer); - unix_state_unlock(peer); sock_put(peer); return nla_put_u32(nlskb, UNIX_DIAG_PEER, ino); @@ -75,20 +73,9 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb) buf = nla_data(attr); i = 0; - skb_queue_walk(&sk->sk_receive_queue, skb) { - struct sock *req, *peer; - - req = skb->sk; - /* - * The state lock is outer for the same sk's - * queue lock. With the other's queue locked it's - * OK to lock the state. - */ - unix_state_lock_nested(req, U_LOCK_DIAG); - peer = unix_sk(req)->peer; - buf[i++] = (peer ? sock_i_ino(peer) : 0); - unix_state_unlock(req); - } + skb_queue_walk(&sk->sk_receive_queue, skb) + buf[i++] = sock_i_ino(unix_peer(skb->sk)); + spin_unlock(&sk->sk_receive_queue.lock); } @@ -180,22 +167,6 @@ out_nlmsg_trim: return -EMSGSIZE; } -static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u32 flags) -{ - int sk_ino; - - unix_state_lock(sk); - sk_ino = sock_i_ino(sk); - unix_state_unlock(sk); - - if (!sk_ino) - return 0; - - return sk_diag_fill(sk, skb, req, user_ns, portid, seq, flags, sk_ino); -} - static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -213,14 +184,22 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) num = 0; spin_lock(&net->unx.table.locks[slot]); sk_for_each(sk, &net->unx.table.buckets[slot]) { + int sk_ino; + if (num < s_num) goto next; + if (!(req->udiag_states & (1 << READ_ONCE(sk->sk_state)))) goto next; - if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk), + + sk_ino = sock_i_ino(sk); + if (!sk_ino) + goto next; + + if (sk_diag_fill(sk, skb, req, sk_user_ns(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI) < 0) { + NLM_F_MULTI, sk_ino) < 0) { spin_unlock(&net->unx.table.locks[slot]); goto done; } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index dfe94a90ece4..eb8aa5171a68 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -337,11 +337,6 @@ static bool unix_vertex_dead(struct unix_vertex *vertex) return true; } -enum unix_recv_queue_lock_class { - U_RECVQ_LOCK_NORMAL, - U_RECVQ_LOCK_EMBRYO, -}; - static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist) { skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist); @@ -375,8 +370,7 @@ static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist skb_queue_walk(queue, skb) { struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; - /* listener -> embryo order, the inversion never happens. */ - spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + spin_lock(&embryo_queue->lock); unix_collect_queue(unix_sk(skb->sk), hitlist); spin_unlock(&embryo_queue->lock); } diff --git a/net/wireless/core.c b/net/wireless/core.c index 4b1f45e3070e..4d5d351bd0b5 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -421,6 +421,8 @@ static void cfg80211_wiphy_work(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, wiphy_work); + trace_wiphy_work_worker_start(&rdev->wiphy); + wiphy_lock(&rdev->wiphy); if (rdev->suspended) goto out; @@ -434,6 +436,7 @@ static void cfg80211_wiphy_work(struct work_struct *work) queue_work(system_unbound_wq, work); spin_unlock_irq(&rdev->wiphy_work_lock); + trace_wiphy_work_run(&rdev->wiphy, wk); wk->func(&rdev->wiphy, wk); } else { spin_unlock_irq(&rdev->wiphy_work_lock); @@ -1066,6 +1069,7 @@ void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev, list_del_init(&wk->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); + trace_wiphy_work_run(&rdev->wiphy, wk); wk->func(&rdev->wiphy, wk); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); @@ -1141,7 +1145,8 @@ void wiphy_unregister(struct wiphy *wiphy) flush_work(&rdev->background_cac_abort_wk); cfg80211_rdev_free_wowlan(rdev); - cfg80211_rdev_free_coalesce(rdev); + cfg80211_free_coalesce(rdev->coalesce); + rdev->coalesce = NULL; } EXPORT_SYMBOL(wiphy_unregister); @@ -1610,6 +1615,8 @@ void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long flags; + trace_wiphy_work_queue(wiphy, work); + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (list_empty(&work->entry)) list_add_tail(&work->entry, &rdev->wiphy_work_list); @@ -1626,6 +1633,8 @@ void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work) lockdep_assert_held(&wiphy->mtx); + trace_wiphy_work_cancel(wiphy, work); + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (!list_empty(&work->entry)) list_del_init(&work->entry); @@ -1639,6 +1648,8 @@ void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work) unsigned long flags; bool run; + trace_wiphy_work_flush(wiphy, work); + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); run = !work || !list_empty(&work->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); @@ -1660,6 +1671,8 @@ void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, unsigned long delay) { + trace_wiphy_delayed_work_queue(wiphy, &dwork->work, delay); + if (!delay) { del_timer(&dwork->timer); wiphy_work_queue(wiphy, &dwork->work); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 9f02ee5f08be..34e5acff3935 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -3,7 +3,7 @@ * Some IBSS support code for cfg80211. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/etherdevice.h> @@ -94,6 +94,9 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, lockdep_assert_held(&rdev->wiphy.mtx); + if (wdev->cac_started) + return -EBUSY; + if (wdev->u.ibss.ssid_len) return -EALREADY; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 83306979fbe2..aaca65b66af4 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Portions - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> @@ -127,6 +127,9 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, if (!rdev->ops->join_mesh) return -EOPNOTSUPP; + if (wdev->cac_started) + return -EBUSY; + if (!setup->chandef.chan) { /* if no channel explicitly given, use preset channel */ setup->chandef = wdev->u.mesh.preset_chandef; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 72c7bf558581..b457c2fcee30 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3419,6 +3419,33 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, if (chandef.chan != cur_chan) return -EBUSY; + /* only allow this for regular channel widths */ + switch (wdev->links[link_id].ap.chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_320: + break; + default: + return -EINVAL; + } + + switch (chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_320: + break; + default: + return -EINVAL; + } + result = rdev_set_ap_chanwidth(rdev, dev, link_id, &chandef); if (result) @@ -5941,6 +5968,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->start_ap) return -EOPNOTSUPP; + if (wdev->cac_started) + return -EBUSY; + if (wdev->links[link_id].ap.beacon_interval) return -EALREADY; @@ -9933,6 +9963,17 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, flush_delayed_work(&rdev->dfs_update_channels_wk); + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_ADHOC: + break; + default: + /* caution - see cfg80211_beaconing_iface_active() below */ + return -EINVAL; + } + wiphy_lock(wiphy); dfs_region = reg_get_dfs_region(wiphy); @@ -9963,12 +10004,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, goto unlock; } - if (netif_carrier_ok(dev)) { - err = -EBUSY; - goto unlock; - } - - if (wdev->cac_started) { + if (cfg80211_beaconing_iface_active(wdev) || wdev->cac_started) { err = -EBUSY; goto unlock; } @@ -13865,9 +13901,8 @@ nla_put_failure: return -ENOBUFS; } -void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev) +void cfg80211_free_coalesce(struct cfg80211_coalesce *coalesce) { - struct cfg80211_coalesce *coalesce = rdev->coalesce; int i, j; struct cfg80211_coalesce_rules *rule; @@ -13876,13 +13911,13 @@ void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev) for (i = 0; i < coalesce->n_rules; i++) { rule = &coalesce->rules[i]; + if (!rule) + continue; for (j = 0; j < rule->n_patterns; j++) kfree(rule->patterns[j].mask); kfree(rule->patterns); } - kfree(coalesce->rules); kfree(coalesce); - rdev->coalesce = NULL; } static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, @@ -13980,17 +14015,16 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce; - struct cfg80211_coalesce new_coalesce = {}; - struct cfg80211_coalesce *n_coalesce; - int err, rem_rule, n_rules = 0, i, j; + struct cfg80211_coalesce *new_coalesce; + int err, rem_rule, n_rules = 0, i; struct nlattr *rule; - struct cfg80211_coalesce_rules *tmp_rule; if (!rdev->wiphy.coalesce || !rdev->ops->set_coalesce) return -EOPNOTSUPP; if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) { - cfg80211_rdev_free_coalesce(rdev); + cfg80211_free_coalesce(rdev->coalesce); + rdev->coalesce = NULL; rdev_set_coalesce(rdev, NULL); return 0; } @@ -14001,47 +14035,34 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) if (n_rules > coalesce->n_rules) return -EINVAL; - new_coalesce.rules = kcalloc(n_rules, sizeof(new_coalesce.rules[0]), - GFP_KERNEL); - if (!new_coalesce.rules) + new_coalesce = kzalloc(struct_size(new_coalesce, rules, n_rules), + GFP_KERNEL); + if (!new_coalesce) return -ENOMEM; - new_coalesce.n_rules = n_rules; + new_coalesce->n_rules = n_rules; i = 0; nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE], rem_rule) { err = nl80211_parse_coalesce_rule(rdev, rule, - &new_coalesce.rules[i]); + &new_coalesce->rules[i]); if (err) goto error; i++; } - err = rdev_set_coalesce(rdev, &new_coalesce); + err = rdev_set_coalesce(rdev, new_coalesce); if (err) goto error; - n_coalesce = kmemdup(&new_coalesce, sizeof(new_coalesce), GFP_KERNEL); - if (!n_coalesce) { - err = -ENOMEM; - goto error; - } - cfg80211_rdev_free_coalesce(rdev); - rdev->coalesce = n_coalesce; + cfg80211_free_coalesce(rdev->coalesce); + rdev->coalesce = new_coalesce; return 0; error: - for (i = 0; i < new_coalesce.n_rules; i++) { - tmp_rule = &new_coalesce.rules[i]; - if (!tmp_rule) - continue; - for (j = 0; j < tmp_rule->n_patterns; j++) - kfree(tmp_rule->patterns[j].mask); - kfree(tmp_rule->patterns); - } - kfree(new_coalesce.rules); + cfg80211_free_coalesce(new_coalesce); return err; } diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 6376f3a87f8a..ffaab9a92e5b 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file - * Copyright (C) 2018, 2020-2022 Intel Corporation + * Copyright (C) 2018, 2020-2024 Intel Corporation */ #ifndef __NET_WIRELESS_NL80211_H #define __NET_WIRELESS_NL80211_H @@ -119,7 +119,7 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, void nl80211_send_ap_stopped(struct wireless_dev *wdev, unsigned int link_id); -void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); +void cfg80211_free_coalesce(struct cfg80211_coalesce *coalesce); /* peer measurement */ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 87986170d1b1..6ef9294747e3 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -243,6 +243,80 @@ } while (0) /************************************************************* + * wiphy work traces * + *************************************************************/ + +DECLARE_EVENT_CLASS(wiphy_work_event, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work), + TP_ARGS(wiphy, work), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(void *, instance) + __field(void *, func) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->instance = work; + __entry->func = work ? work->func : NULL; + ), + TP_printk(WIPHY_PR_FMT " instance=%p func=%pS", + WIPHY_PR_ARG, __entry->instance, __entry->func) +); + +DEFINE_EVENT(wiphy_work_event, wiphy_work_queue, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work), + TP_ARGS(wiphy, work) +); + +DEFINE_EVENT(wiphy_work_event, wiphy_work_run, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work), + TP_ARGS(wiphy, work) +); + +DEFINE_EVENT(wiphy_work_event, wiphy_work_cancel, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work), + TP_ARGS(wiphy, work) +); + +DEFINE_EVENT(wiphy_work_event, wiphy_work_flush, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work), + TP_ARGS(wiphy, work) +); + +TRACE_EVENT(wiphy_delayed_work_queue, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work, + unsigned long delay), + TP_ARGS(wiphy, work, delay), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(void *, instance) + __field(void *, func) + __field(unsigned long, delay) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->instance = work; + __entry->func = work->func; + __entry->delay = delay; + ), + TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%ld", + WIPHY_PR_ARG, __entry->instance, __entry->func, + __entry->delay) +); + +TRACE_EVENT(wiphy_work_worker_start, + TP_PROTO(struct wiphy *wiphy), + TP_ARGS(wiphy), + TP_STRUCT__entry( + WIPHY_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + ), + TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG) +); + +/************************************************************* * rdev->ops traces * *************************************************************/ @@ -2889,6 +2963,75 @@ DEFINE_EVENT(wiphy_wdev_link_evt, rdev_del_intf_link, TP_ARGS(wiphy, wdev, link_id) ); +TRACE_EVENT(rdev_del_link_station, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_del_parameters *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, mld_mac, 6) + __field(u32, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memset(__entry->mld_mac, 0, 6); + if (params->mld_mac) + memcpy(__entry->mld_mac, params->mld_mac, 6); + __entry->link_id = params->link_id; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" + ", link id: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac, + __entry->link_id) +); + +TRACE_EVENT(rdev_set_hw_timestamp, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_set_hw_timestamp *hwts), + + TP_ARGS(wiphy, netdev, hwts), + + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(macaddr) + __field(bool, enable) + ), + + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(macaddr, hwts->macaddr); + __entry->enable = hwts->enable; + ), + + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac %pM, enable: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr, + __entry->enable) +); + +TRACE_EVENT(rdev_set_ttlm, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_ttlm_params *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, dlink, sizeof(u16) * 8) + __array(u8, ulink, sizeof(u16) * 8) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memcpy(__entry->dlink, params->dlink, sizeof(params->dlink)); + memcpy(__entry->ulink, params->ulink, sizeof(params->ulink)); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -3923,55 +4066,6 @@ DEFINE_EVENT(link_station_add_mod, rdev_mod_link_station, TP_ARGS(wiphy, netdev, params) ); -TRACE_EVENT(rdev_del_link_station, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct link_station_del_parameters *params), - TP_ARGS(wiphy, netdev, params), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - __array(u8, mld_mac, 6) - __field(u32, link_id) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - memset(__entry->mld_mac, 0, 6); - if (params->mld_mac) - memcpy(__entry->mld_mac, params->mld_mac, 6); - __entry->link_id = params->link_id; - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" - ", link id: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac, - __entry->link_id) -); - -TRACE_EVENT(rdev_set_hw_timestamp, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_set_hw_timestamp *hwts), - - TP_ARGS(wiphy, netdev, hwts), - - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - MAC_ENTRY(macaddr) - __field(bool, enable) - ), - - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - MAC_ASSIGN(macaddr, hwts->macaddr); - __entry->enable = hwts->enable; - ), - - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac %pM, enable: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr, - __entry->enable) -); - TRACE_EVENT(cfg80211_links_removed, TP_PROTO(struct net_device *netdev, u16 link_mask), TP_ARGS(netdev, link_mask), @@ -3987,26 +4081,6 @@ TRACE_EVENT(cfg80211_links_removed, __entry->link_mask) ); -TRACE_EVENT(rdev_set_ttlm, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_ttlm_params *params), - TP_ARGS(wiphy, netdev, params), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - __array(u8, dlink, sizeof(u16) * 8) - __array(u8, ulink, sizeof(u16) * 8) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - memcpy(__entry->dlink, params->dlink, sizeof(params->dlink)); - memcpy(__entry->ulink, params->ulink, sizeof(params->ulink)); - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG) -); - #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7d1c0986f9bb..ed062e038389 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -35,8 +35,6 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) -static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); - void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -372,7 +370,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); + struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); int err; err = xsk_rcv(xs, xdp); @@ -387,7 +385,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) void __xsk_map_flush(void) { - struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); + struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { @@ -399,7 +397,7 @@ void __xsk_map_flush(void) #ifdef CONFIG_DEBUG_NET bool xsk_map_check_flush(void) { - if (list_empty(this_cpu_ptr(&xskmap_flush_list))) + if (list_empty(bpf_net_ctx_get_xskmap_flush_list())) return false; __xsk_map_flush(); return true; @@ -1772,7 +1770,7 @@ static struct pernet_operations xsk_net_ops = { static int __init xsk_init(void) { - int err, cpu; + int err; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) @@ -1790,8 +1788,6 @@ static int __init xsk_init(void) if (err) goto out_pernet; - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); return 0; out_pernet: |