From 46ef5b89ec0ecf290d74c4aee844f063933c4da4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Mon, 13 Jul 2020 23:59:50 +0800
Subject: ip6_gre: fix null-ptr-deref in ip6gre_init_net()

KASAN reported a null-ptr-deref error when register_netdev() failed:

KASAN: null-ptr-deref in range [0x00000000000003c0-0x00000000000003c7]
CPU: 2 PID: 422 Comm: ip Not tainted 5.8.0-rc4+ #12
Call Trace:
 ip6gre_init_net+0x4ab/0x580
 ? ip6gre_tunnel_uninit+0x3f0/0x3f0
 ops_init+0xa8/0x3c0
 setup_net+0x2de/0x7e0
 ? rcu_read_lock_bh_held+0xb0/0xb0
 ? ops_init+0x3c0/0x3c0
 ? kasan_unpoison_shadow+0x33/0x40
 ? __kasan_kmalloc.constprop.0+0xc2/0xd0
 copy_net_ns+0x27d/0x530
 create_new_namespaces+0x382/0xa30
 unshare_nsproxy_namespaces+0xa1/0x1d0
 ksys_unshare+0x39c/0x780
 ? walk_process_tree+0x2a0/0x2a0
 ? trace_hardirqs_on+0x4a/0x1b0
 ? _raw_spin_unlock_irq+0x1f/0x30
 ? syscall_trace_enter+0x1a7/0x330
 ? do_syscall_64+0x1c/0xa0
 __x64_sys_unshare+0x2d/0x40
 do_syscall_64+0x56/0xa0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

ip6gre_tunnel_uninit() has set 'ign->fb_tunnel_dev' to NULL, so a later
access to ign->fb_tunnel_dev causes the null-ptr-deref. Fix it by saving
'ign->fb_tunnel_dev' to the local variable ndev.

Fixes: dafabb6590cb ("ip6_gre: fix use-after-free in ip6gre_tunnel_lookup()")
Reported-by: Hulk Robot
Signed-off-by: Wei Yongjun
Reviewed-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv6/ip6_gre.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 6532bde82b40..3a57fb9ce049 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1562,17 +1562,18 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
 static int __net_init ip6gre_init_net(struct net *net)
 {
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+	struct net_device *ndev;
 	int err;
 
 	if (!net_has_fallback_tunnels(net))
 		return 0;
-	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
-					  NET_NAME_UNKNOWN,
-					  ip6gre_tunnel_setup);
-	if (!ign->fb_tunnel_dev) {
+	ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
+			    NET_NAME_UNKNOWN, ip6gre_tunnel_setup);
+	if (!ndev) {
 		err = -ENOMEM;
 		goto err_alloc_dev;
 	}
+	ign->fb_tunnel_dev = ndev;
 	dev_net_set(ign->fb_tunnel_dev, net);
 	/* FB netdevice is special: we have one, and only one per netns.
 	 * Allowing to move it to another netns is clearly unsafe.
@@ -1592,7 +1593,7 @@ static int __net_init ip6gre_init_net(struct net *net)
 	return 0;

 err_reg_dev:
-	free_netdev(ign->fb_tunnel_dev);
+	free_netdev(ndev);
 err_alloc_dev:
 	return err;
 }
-- cgit

From 1e9451cbda456a170518b2bfd643e2cb980880bf Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 14 Jul 2020 18:51:39 +0200
Subject: netfilter: nf_tables: fix nat hook table deletion

syzbot came up with the following transaction:

 add table ip syz0
 add chain ip syz0 syz2 { type nat hook prerouting priority 0; policy accept; }
 add table ip syz0 { flags dormant; }
 delete chain ip syz0 syz2
 delete table ip syz0

which yields:

hook not found, pf 2 num 0
WARNING: CPU: 0 PID: 6775 at net/netfilter/core.c:413 __nf_unregister_net_hook+0x3e6/0x4a0 net/netfilter/core.c:413
[..]
 nft_unregister_basechain_hooks net/netfilter/nf_tables_api.c:206 [inline]
 nft_table_disable net/netfilter/nf_tables_api.c:835 [inline]
 nf_tables_table_disable net/netfilter/nf_tables_api.c:868 [inline]
 nf_tables_commit+0x32d3/0x4d70 net/netfilter/nf_tables_api.c:7550
 nfnetlink_rcv_batch net/netfilter/nfnetlink.c:486 [inline]
 nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:544 [inline]
 nfnetlink_rcv+0x14a5/0x1e50 net/netfilter/nfnetlink.c:562
 netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline]

The problem is that when I added the ability to override base hook
registration to make nat basechains register with the nat core instead
of the netfilter core, I forgot to update nft_table_disable() to use
that instead of the 'raw' hook register interface.

In the syzbot transaction, the basechain is of 'nat' type. It's
registered with the nat core. The switch to 'dormant mode' attempts to
delete it from the netfilter core instead.

After updating nft_table_disable/enable to use the correct helper,
nft_(un)register_basechain_hooks can be folded into the only remaining
caller.

Because nft_trans_table_enable() won't do anything when the DORMANT
flag is set, remove the flag first, then re-add it in case
re-enablement fails, else this patch breaks the sequence:

 add table ip x { flags dormant; }
 /* add base chains */
 add table ip x

The last 'add' will remove the dormant flag, but won't have any other
effect -- base chains are not registered. Then the next 'set dormant
flag' will create another 'hook not found' splat.

Reported-by: syzbot+2570f2c036e3da5db176@syzkaller.appspotmail.com
Fixes: 4e25ceb80b58 ("netfilter: nf_tables: allow chain type to override hook register")
Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_tables_api.c | 41 ++++++++++++++---------------------------
 1 file changed, 14 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7647ecfa0d40..88325b264737 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -188,24 +188,6 @@ static void nft_netdev_unregister_hooks(struct net *net,
 			nf_unregister_net_hook(net, &hook->ops);
 }
 
-static int nft_register_basechain_hooks(struct net *net, int family,
-					struct nft_base_chain *basechain)
-{
-	if (family == NFPROTO_NETDEV)
-		return nft_netdev_register_hooks(net, &basechain->hook_list);
-
-	return nf_register_net_hook(net, &basechain->ops);
-}
-
-static void nft_unregister_basechain_hooks(struct net *net, int family,
-					   struct nft_base_chain *basechain)
-{
-	if (family == NFPROTO_NETDEV)
-		nft_netdev_unregister_hooks(net, &basechain->hook_list);
-	else
-		nf_unregister_net_hook(net, &basechain->ops);
-}
-
 static int nf_tables_register_hook(struct net *net,
 				   const struct nft_table *table,
 				   struct nft_chain *chain)
@@ -223,7 +205,10 @@ static int nf_tables_register_hook(struct net *net,
 	if (basechain->type->ops_register)
 		return basechain->type->ops_register(net, ops);
 
-	return nft_register_basechain_hooks(net, table->family, basechain);
+	if (table->family == NFPROTO_NETDEV)
+		return nft_netdev_register_hooks(net, &basechain->hook_list);
+
+	return nf_register_net_hook(net, &basechain->ops);
 }
 
 static void nf_tables_unregister_hook(struct net *net,
@@ -242,7 +227,10 @@ static void nf_tables_unregister_hook(struct net *net,
 	if (basechain->type->ops_unregister)
 		return basechain->type->ops_unregister(net, ops);
 
-	nft_unregister_basechain_hooks(net, table->family, basechain);
+	if (table->family == NFPROTO_NETDEV)
+		nft_netdev_unregister_hooks(net, &basechain->hook_list);
+	else
+		nf_unregister_net_hook(net, &basechain->ops);
 }
 
 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -832,8 +820,7 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
 		if (cnt && i++ == cnt)
 			break;
 
-		nft_unregister_basechain_hooks(net, table->family,
-					       nft_base_chain(chain));
+		nf_tables_unregister_hook(net, table, chain);
 	}
 }
 
@@ -848,8 +835,7 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table)
 		if (!nft_is_base_chain(chain))
 			continue;
 
-		err = nft_register_basechain_hooks(net, table->family,
-						   nft_base_chain(chain));
+		err = nf_tables_register_hook(net, table, chain);
 		if (err < 0)
 			goto err_register_hooks;
 
@@ -894,11 +880,12 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 			nft_trans_table_enable(trans) = false;
 	} else if (!(flags & NFT_TABLE_F_DORMANT) &&
 		   ctx->table->flags & NFT_TABLE_F_DORMANT) {
+		ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
 		ret = nf_tables_table_enable(ctx->net, ctx->table);
-		if (ret >= 0) {
-			ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
+		if (ret >= 0)
 			nft_trans_table_enable(trans) = true;
-		}
+		else
+			ctx->table->flags |= NFT_TABLE_F_DORMANT;
 	}
 	if (ret < 0)
 		goto err;
-- cgit

From f961134a612c793d5901a93d85a29337c74af978 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella
Date: Fri, 10 Jul 2020 14:12:43 +0200
Subject: vsock/virtio: annotate 'the_virtio_vsock' RCU pointer

Commit 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on
the_virtio_vsock") starts to use RCU to protect the 'the_virtio_vsock'
pointer, but we forgot to annotate it.

This patch adds the annotation to fix the following sparse errors:

net/vmw_vsock/virtio_transport.c:73:17: error: incompatible types in comparison expression (different address spaces):
net/vmw_vsock/virtio_transport.c:73:17:    struct virtio_vsock [noderef] __rcu *
net/vmw_vsock/virtio_transport.c:73:17:    struct virtio_vsock *
net/vmw_vsock/virtio_transport.c:171:17: error: incompatible types in comparison expression (different address spaces):
net/vmw_vsock/virtio_transport.c:171:17:    struct virtio_vsock [noderef] __rcu *
net/vmw_vsock/virtio_transport.c:171:17:    struct virtio_vsock *
net/vmw_vsock/virtio_transport.c:207:17: error: incompatible types in comparison expression (different address spaces):
net/vmw_vsock/virtio_transport.c:207:17:    struct virtio_vsock [noderef] __rcu *
net/vmw_vsock/virtio_transport.c:207:17:    struct virtio_vsock *
net/vmw_vsock/virtio_transport.c:561:13: error: incompatible types in comparison expression (different address spaces):
net/vmw_vsock/virtio_transport.c:561:13:    struct virtio_vsock [noderef] __rcu *
net/vmw_vsock/virtio_transport.c:561:13:    struct virtio_vsock *
net/vmw_vsock/virtio_transport.c:612:9: error: incompatible types in comparison expression (different address spaces):
net/vmw_vsock/virtio_transport.c:612:9:    struct virtio_vsock [noderef] __rcu *
net/vmw_vsock/virtio_transport.c:612:9:    struct virtio_vsock *
net/vmw_vsock/virtio_transport.c:631:9: error: incompatible types in comparison expression (different address spaces):
net/vmw_vsock/virtio_transport.c:631:9:    struct virtio_vsock [noderef] __rcu *
net/vmw_vsock/virtio_transport.c:631:9:    struct virtio_vsock *

Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on the_virtio_vsock")
Reported-by: Michael S. Tsirkin
Signed-off-by: Stefano Garzarella
Reviewed-by: Stefan Hajnoczi
Acked-by: Michael S. Tsirkin
Signed-off-by: Jakub Kicinski
---
 net/vmw_vsock/virtio_transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index dfbaf6bd8b1c..2700a63ab095 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -22,7 +22,7 @@
 #include
 
 static struct workqueue_struct *virtio_vsock_workqueue;
-static struct virtio_vsock *the_virtio_vsock;
+static struct virtio_vsock __rcu *the_virtio_vsock;
 static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */
 
 struct virtio_vsock {
-- cgit

From cebb69754f37d68e1355a5e726fdac317bcda302 Mon Sep 17 00:00:00 2001
From: Weilong Chen
Date: Wed, 15 Jul 2020 20:58:10 +0800
Subject: rtnetlink: Fix memory(net_device) leak when ->newlink fails

When vlan_newlink's call to register_vlan_dev fails, it might return an
error with dev->reg_state = NETREG_UNREGISTERED. rtnl_newlink should
then free the memory, but currently it only frees the memory when the
state is NETREG_UNINITIALIZED.

BUG: memory leak
unreferenced object 0xffff8881051de000 (size 4096):
  comm "syz-executor139", pid 560, jiffies 4294745346 (age 32.445s)
  hex dump (first 32 bytes):
    76 6c 61 6e 32 00 00 00 00 00 00 00 00 00 00 00  vlan2...........
    00 45 28 03 81 88 ff ff 00 00 00 00 00 00 00 00  .E(.............
  backtrace:
    [<0000000047527e31>] kmalloc_node include/linux/slab.h:578 [inline]
    [<0000000047527e31>] kvmalloc_node+0x33/0xd0 mm/util.c:574
    [<000000002b59e3bc>] kvmalloc include/linux/mm.h:753 [inline]
    [<000000002b59e3bc>] kvzalloc include/linux/mm.h:761 [inline]
    [<000000002b59e3bc>] alloc_netdev_mqs+0x83/0xd90 net/core/dev.c:9929
    [<000000006076752a>] rtnl_create_link+0x2c0/0xa20 net/core/rtnetlink.c:3067
    [<00000000572b3be5>] __rtnl_newlink+0xc9c/0x1330 net/core/rtnetlink.c:3329
    [<00000000e84ea553>] rtnl_newlink+0x66/0x90 net/core/rtnetlink.c:3397
    [<0000000052c7c0a9>] rtnetlink_rcv_msg+0x540/0x990 net/core/rtnetlink.c:5460
    [<000000004b5cb379>] netlink_rcv_skb+0x12b/0x3a0 net/netlink/af_netlink.c:2469
    [<00000000c71c20d3>] netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline]
    [<00000000c71c20d3>] netlink_unicast+0x4c6/0x690 net/netlink/af_netlink.c:1329
    [<00000000cca72fa9>] netlink_sendmsg+0x735/0xcc0 net/netlink/af_netlink.c:1918
    [<000000009221ebf7>] sock_sendmsg_nosec net/socket.c:652 [inline]
    [<000000009221ebf7>] sock_sendmsg+0x109/0x140 net/socket.c:672
    [<000000001c30ffe4>] ____sys_sendmsg+0x5f5/0x780 net/socket.c:2352
    [<00000000b71ca6f3>] ___sys_sendmsg+0x11d/0x1a0 net/socket.c:2406
    [<0000000007297384>] __sys_sendmsg+0xeb/0x1b0 net/socket.c:2439
    [<000000000eb29b11>] do_syscall_64+0x56/0xa0 arch/x86/entry/common.c:359
    [<000000006839b4d0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: cb626bf566eb ("net-sysfs: Fix reference count leak")
Reported-by: Hulk Robot
Signed-off-by: Weilong Chen
Signed-off-by: David S. Miller
---
 net/core/rtnetlink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9aedc15736ad..85a4b0101f76 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3343,7 +3343,8 @@ replay:
 	 */
 	if (err < 0) {
 		/* If device is not registered at all, free it now */
-		if (dev->reg_state == NETREG_UNINITIALIZED)
+		if (dev->reg_state == NETREG_UNINITIALIZED ||
+		    dev->reg_state == NETREG_UNREGISTERED)
 			free_netdev(dev);
 		goto out;
 	}
-- cgit

From 0b4a66a389d1ff5dab29f688fcfe36482bc889a2 Mon Sep 17 00:00:00 2001
From: Wang Hai
Date: Fri, 17 Jul 2020 15:10:16 +0800
Subject: nfc: nci: add missed destroy_workqueue in nci_register_device

When nfc_register_device fails in nci_register_device,
destroy_workqueue() should be called to destroy ndev->tx_wq.

Fixes: 3c1c0f5dc80b ("NFC: NCI: Fix nci_register_device init sequence")
Reported-by: Hulk Robot
Signed-off-by: Wang Hai
Signed-off-by: David S. Miller
---
 net/nfc/nci/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 7cd524884304..78ea8c94dcba 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -1228,10 +1228,13 @@ int nci_register_device(struct nci_dev *ndev)
 
 	rc = nfc_register_device(ndev->nfc_dev);
 	if (rc)
-		goto destroy_rx_wq_exit;
+		goto destroy_tx_wq_exit;
 
 	goto exit;
 
+destroy_tx_wq_exit:
+	destroy_workqueue(ndev->tx_wq);
+
 destroy_rx_wq_exit:
 	destroy_workqueue(ndev->rx_wq);
-- cgit

From 6d6148bc78d2f002ecc67b801c5f55a0cc5184c8 Mon Sep 17 00:00:00 2001
From: Murali Karicheri
Date: Fri, 17 Jul 2020 10:55:09 -0400
Subject: net: hsr: fix incorrect lsdu size in the tag of HSR frames for small frames

For small Ethernet frames below the minimum size (66 bytes for HSR vs.
60 for regular Ethernet frames), the hsr driver currently doesn't pad
the frame to the minimum size. This results in an incorrect LSDU size
being populated in the HSR tag for these frames. Fix this by padding
the frame to the minimum size applicable for HSR.

Signed-off-by: Murali Karicheri
Signed-off-by: David S. Miller
---
 net/hsr/hsr_forward.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index ed13760463de..e42fd356f073 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -127,6 +127,9 @@ static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
 	int lane_id;
 	int lsdu_size;
 
+	/* pad to minimum packet size which is 60 + 6 (HSR tag) */
+	skb_put_padto(skb, ETH_ZLEN + HSR_HLEN);
+
 	if (port->type == HSR_PT_SLAVE_A)
 		lane_id = 0;
 	else
-- cgit

From eea9f73e1ff9b99fbb937d41bb07c5bb2ae9ea2f Mon Sep 17 00:00:00 2001
From: Murali Karicheri
Date: Fri, 17 Jul 2020 10:55:10 -0400
Subject: net: hsr: validate address B before copying to skb

Validate the MAC address before copying it to the outgoing frame's skb
destination address. Since a node can have a zero mac address for Link B
until a valid frame is received over that link, this fix addresses the
issue of a zero MAC address being in the packet.

Signed-off-by: Murali Karicheri
Signed-off-by: David S. Miller
---
 net/hsr/hsr_framereg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 03b891904314..530de24b1fb5 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -325,7 +325,8 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
 	if (port->type != node_dst->addr_B_port)
 		return;
 
-	ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B);
+	if (is_valid_ether_addr(node_dst->macaddress_B))
+		ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B);
 }
 
 void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
-- cgit

From a35fffbf98189ba8359f19073286b2ea816255c5 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:09 +0200
Subject: net/smc: handle unexpected response types for confirm link

A delete link could arrive during confirm link processing. Handle this
situation directly in smc_llc_srv_conf_link() rather than using the
logic in smc_llc_wait(), to avoid the unexpected message handling
there.

Reviewed-by: Ursula Braun
Fixes: 1551c95b6124 ("net/smc: final part of add link processing as SMC server")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_llc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index c1a038689c63..58f4da2e0cc7 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -1051,12 +1051,14 @@ static int smc_llc_srv_conf_link(struct smc_link *link,
 	if (rc)
 		return -ENOLINK;
 	/* receive CONFIRM LINK response over the RoCE fabric */
-	qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME,
-			      SMC_LLC_CONFIRM_LINK);
-	if (!qentry) {
+	qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0);
+	if (!qentry ||
+	    qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) {
 		/* send DELETE LINK */
 		smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
 					 false, SMC_LLC_DEL_LOST_PATH);
+		if (qentry)
+			smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
 		return -ENOLINK;
 	}
 	smc_llc_save_peer_uid(qentry);
-- cgit

From 68fd8942038f30dbb64a594dc15d9948289de42a Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:10 +0200
Subject: net/smc: clear link during SMC client link down processing

In a link-down condition we notify the SMC server and expect that the
server will finally trigger the link clear processing on the client
side. This could fail when anything along this notification path goes
wrong. Clear the link as part of SMC client link-down processing to
prevent dangling links.

Reviewed-by: Ursula Braun
Fixes: 541afa10c126 ("net/smc: add smcr_port_err() and smcr_link_down() processing")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index f69d205b3e11..e286b3c8c962 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1204,10 +1204,12 @@ static void smcr_link_down(struct smc_link *lnk)
 					 SMC_LLC_WAIT_TIME);
 			mutex_lock(&lgr->llc_conf_mutex);
 		}
-		if (!list_empty(&lgr->list))
+		if (!list_empty(&lgr->list)) {
 			smc_llc_send_delete_link(to_lnk, del_link_id,
 						 SMC_LLC_REQ, true,
 						 SMC_LLC_DEL_LOST_PATH);
+			smcr_link_clear(lnk, true);
+		}
 		wake_up(&lgr->llc_flow_waiter);	/* wake up next waiter */
 	}
 }
-- cgit

From 7df8bcb56053173e5e5c0e566391fa601e3e4778 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:11 +0200
Subject: net/smc: fix link lookup for new rdma connections

For new rdma connections the SMC server assigns the link and sends the
link data in the clc accept message. To match the correct link, use not
only the qp_num but also the gid and the mac of the links. If there are
equal qp_nums for different links, the wrong link could be chosen.

Reviewed-by: Ursula Braun
Fixes: 0fb0b02bd6fd ("net/smc: adapt SMC client code to use the LLC flow")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/af_smc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 903321543838..f80591567a3d 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -632,7 +632,9 @@ static int smc_connect_rdma(struct smc_sock *smc,
 		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
 			struct smc_link *l = &smc->conn.lgr->lnk[i];
 
-			if (l->peer_qpn == ntoh24(aclc->qpn)) {
+			if (l->peer_qpn == ntoh24(aclc->qpn) &&
+			    !memcmp(l->peer_gid, &aclc->lcl.gid, SMC_GID_SIZE) &&
+			    !memcmp(l->peer_mac, &aclc->lcl.mac, sizeof(l->peer_mac))) {
 				link = l;
 				break;
 			}
-- cgit

From 63673597cca93ef6fa12414933da01d5806547af Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:12 +0200
Subject: net/smc: protect smc ib device initialization

Before an smc ib device is used the first time for an smc link, it is
lazily initialized. When there are 2 active link groups and a new ib
device is brought online, it might happen that 2 link creations run in
parallel and enter smc_ib_setup_per_ibdev(). Both allocate new send and
receive completion queues on the device, but only one set of them stays
assigned and the other leaks. Fix that by protecting the setup and
cleanup code using a mutex.

Reviewed-by: Ursula Braun
Fixes: f3c1deddb21c ("net/smc: separate function for link initialization")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_ib.c | 16 +++++++++++++---
 net/smc/smc_ib.h | 1 +
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 7637fdebbb78..1c314dbdc7fa 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -506,6 +506,10 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
 	int cqe_size_order, smc_order;
 	long rc;
 
+	mutex_lock(&smcibdev->mutex);
+	rc = 0;
+	if (smcibdev->initialized)
+		goto out;
 	/* the calculated number of cq entries fits to mlx5 cq allocation */
 	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
 	smc_order = MAX_ORDER - cqe_size_order - 1;
@@ -517,7 +521,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
 	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
 	if (IS_ERR(smcibdev->roce_cq_send)) {
 		smcibdev->roce_cq_send = NULL;
-		return rc;
+		goto out;
 	}
 	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
 					      smc_wr_rx_cq_handler, NULL,
@@ -529,21 +533,26 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
 	}
 	smc_wr_add_dev(smcibdev);
 	smcibdev->initialized = 1;
-	return rc;
+	goto out;
 
 err:
 	ib_destroy_cq(smcibdev->roce_cq_send);
+out:
+	mutex_unlock(&smcibdev->mutex);
 	return rc;
 }
 
 static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
 {
+	mutex_lock(&smcibdev->mutex);
 	if (!smcibdev->initialized)
-		return;
+		goto out;
 	smcibdev->initialized = 0;
 	ib_destroy_cq(smcibdev->roce_cq_recv);
 	ib_destroy_cq(smcibdev->roce_cq_send);
 	smc_wr_remove_dev(smcibdev);
+out:
+	mutex_unlock(&smcibdev->mutex);
 }
 
 static struct ib_client smc_ib_client;
@@ -566,6 +575,7 @@ static int smc_ib_add_dev(struct ib_device *ibdev)
 	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
 	atomic_set(&smcibdev->lnk_cnt, 0);
 	init_waitqueue_head(&smcibdev->lnks_deleted);
+	mutex_init(&smcibdev->mutex);
 	mutex_lock(&smc_ib_devices.mutex);
 	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
 	mutex_unlock(&smc_ib_devices.mutex);
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index ae6776e1e726..2ce481187dd0 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -52,6 +52,7 @@ struct smc_ib_device {	/* ib-device infos for smc */
 	DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS);
 	atomic_t		lnk_cnt;	/* number of links on ibdev */
 	wait_queue_head_t	lnks_deleted;	/* wait 4 removal of all links*/
+	struct mutex		mutex;		/* protect dev setup+cleanup */
 };
 
 struct smc_buf_desc;
-- cgit

From 2ff0867851a21ea1ccb0c275ae1df2fe7787e1b9 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:13 +0200
Subject: net/smc: drop out-of-flow llc response messages

To be safe from unexpected or late llc response messages, check if the
arriving message fits the current flow type and drop out-of-flow
messages. Also drop a message when there is already a response assigned
to the flow.

Reviewed-by: Ursula Braun
Fixes: ef79d439cd12 ("net/smc: process llc responses in tasklet context")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_llc.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 58f4da2e0cc7..78704f03e72a 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -1587,6 +1587,8 @@ again:
 static void smc_llc_rx_response(struct smc_link *link,
 				struct smc_llc_qentry *qentry)
 {
+	enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type;
+	struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl;
 	u8 llc_type = qentry->msg.raw.hdr.common.type;
 
 	switch (llc_type) {
@@ -1595,15 +1597,20 @@ static void smc_llc_rx_response(struct smc_link *link,
 			complete(&link->llc_testlink_resp);
 		break;
 	case SMC_LLC_ADD_LINK:
-	case SMC_LLC_DELETE_LINK:
-	case SMC_LLC_CONFIRM_LINK:
 	case SMC_LLC_ADD_LINK_CONT:
+	case SMC_LLC_CONFIRM_LINK:
+		if (flowtype != SMC_LLC_FLOW_ADD_LINK || flow->qentry)
+			break;	/* drop out-of-flow response */
+		goto assign;
+	case SMC_LLC_DELETE_LINK:
+		if (flowtype != SMC_LLC_FLOW_DEL_LINK || flow->qentry)
+			break;	/* drop out-of-flow response */
+		goto assign;
 	case SMC_LLC_CONFIRM_RKEY:
 	case SMC_LLC_DELETE_RKEY:
-		/* assign responses to the local flow, we requested them */
-		smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry);
-		wake_up(&link->lgr->llc_msg_waiter);
-		return;
+		if (flowtype != SMC_LLC_FLOW_RKEY || flow->qentry)
+			break;	/* drop out-of-flow response */
+		goto assign;
 	case SMC_LLC_CONFIRM_RKEY_CONT:
 		/* not used because max links is 3 */
 		break;
@@ -1612,6 +1619,11 @@ static void smc_llc_rx_response(struct smc_link *link,
 		break;
 	}
 	kfree(qentry);
+	return;
+assign:
+	/* assign responses to the local flow, we requested them */
+	smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry);
+	wake_up(&link->lgr->llc_msg_waiter);
 }
 
 static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
-- cgit

From c48254fa48e5bad589fbbf1578dae960cedcafcf Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:14 +0200
Subject: net/smc: move add link processing for new device into llc layer

When a new ib device comes up, smc will send an add link invitation to
the peer if needed. This is currently done with rudimentary flow
control. Under high workload these add link invitations can disturb
other llc flows because they arrive unexpectedly. Fix this by
integrating the invitations into the normal llc event flow and handling
them as a flow. While at it, check for already assigned requests in the
flow before the new add link request is assigned.

Reviewed-by: Ursula Braun
Fixes: 1f90a05d9ff9 ("net/smc: add smcr_port_add() and smcr_link_up() processing")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_core.c | 80 +++++------------------------------------------------
 net/smc/smc_llc.c | 56 ++++++++++++++++++++++++++++++++++----
 net/smc/smc_llc.h | 2 +-
 3 files changed, 58 insertions(+), 80 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index e286b3c8c962..2e965de7412d 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -45,18 +45,10 @@ static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
 static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
 static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
 
-struct smc_ib_up_work {
-	struct work_struct	work;
-	struct smc_link_group	*lgr;
-	struct smc_ib_device	*smcibdev;
-	u8			ibport;
-};
-
 static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
 			 struct smc_buf_desc *buf_desc);
 static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
 
-static void smc_link_up_work(struct work_struct *work);
 static void smc_link_down_work(struct work_struct *work);
 
 /* return head of link group list and its lock for a given link group */
@@ -1106,67 +1098,23 @@ static void smc_conn_abort_work(struct work_struct *work)
 	sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */
 }
 
-/* link is up - establish alternate link if applicable */
-static void smcr_link_up(struct smc_link_group *lgr,
-			 struct smc_ib_device *smcibdev, u8 ibport)
-{
-	struct smc_link *link = NULL;
-
-	if (list_empty(&lgr->list) ||
-	    lgr->type == SMC_LGR_SYMMETRIC ||
-	    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
-		return;
-
-	if (lgr->role == SMC_SERV) {
-		/* trigger local add link processing */
-		link = smc_llc_usable_link(lgr);
-		if (!link)
-			return;
-		smc_llc_srv_add_link_local(link);
-	} else {
-		/* invite server to start add link processing */
-		u8 gid[SMC_GID_SIZE];
-
-		if (smc_ib_determine_gid(smcibdev, ibport, lgr->vlan_id, gid,
-					 NULL))
-			return;
-		if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
-			/* some other llc task is ongoing */
-			wait_event_timeout(lgr->llc_flow_waiter,
-					   (list_empty(&lgr->list) ||
-					    lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
-					   SMC_LLC_WAIT_TIME);
-		}
-		/* lgr or device no longer active? */
-		if (!list_empty(&lgr->list) &&
-		    smc_ib_port_active(smcibdev, ibport))
-			link = smc_llc_usable_link(lgr);
-		if (link)
-			smc_llc_send_add_link(link, smcibdev->mac[ibport - 1],
-					      gid, NULL, SMC_LLC_REQ);
-		wake_up(&lgr->llc_flow_waiter);	/* wake up next waiter */
-	}
-}
-
 void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport)
 {
-	struct smc_ib_up_work *ib_work;
 	struct smc_link_group *lgr, *n;
 
 	list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
+		struct smc_link *link;
+
 		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
 			    SMC_MAX_PNETID_LEN) ||
 		    lgr->type == SMC_LGR_SYMMETRIC ||
 		    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
 			continue;
-		ib_work = kmalloc(sizeof(*ib_work), GFP_KERNEL);
-		if (!ib_work)
-			continue;
-		INIT_WORK(&ib_work->work, smc_link_up_work);
-		ib_work->lgr = lgr;
-		ib_work->smcibdev = smcibdev;
-		ib_work->ibport = ibport;
-		schedule_work(&ib_work->work);
+
+		/* trigger local add link processing */
+		link = smc_llc_usable_link(lgr);
+		if (link)
+			smc_llc_add_link_local(link);
 	}
 }
 
@@ -1249,20 +1197,6 @@ void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport)
 	}
 }
 
-static void smc_link_up_work(struct work_struct *work)
-{
-	struct smc_ib_up_work *ib_work = container_of(work,
-						      struct smc_ib_up_work,
-						      work);
-	struct smc_link_group *lgr = ib_work->lgr;
-
-	if (list_empty(&lgr->list))
-		goto out;
-	smcr_link_up(lgr, ib_work->smcibdev, ib_work->ibport);
-out:
-	kfree(ib_work);
-}
-
 static void smc_link_down_work(struct work_struct *work)
 {
 	struct smc_link *link = container_of(work, struct smc_link,
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 78704f03e72a..30da040ab5b6 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -895,6 +895,36 @@ out:
 	return rc;
 }
 
+/* as an SMC client, invite server to start the add_link processing */
+static void smc_llc_cli_add_link_invite(struct smc_link *link,
+					struct smc_llc_qentry *qentry)
+{
+	struct smc_link_group *lgr = smc_get_lgr(link);
+	struct smc_init_info ini;
+
+	if (lgr->type == SMC_LGR_SYMMETRIC ||
+	    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+		goto out;
+
+	ini.vlan_id = lgr->vlan_id;
+	smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
+	if (!ini.ib_dev)
+		goto out;
+
+	smc_llc_send_add_link(link, ini.ib_dev->mac[ini.ib_port - 1],
+			      ini.ib_gid, NULL, SMC_LLC_REQ);
+out:
+	kfree(qentry);
+}
+
+static bool smc_llc_is_local_add_link(union smc_llc_msg *llc)
+{
+	if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK &&
+	    !llc->add_link.qp_mtu && !llc->add_link.link_num)
+		return true;
+	return false;
+}
+
 static void smc_llc_process_cli_add_link(struct smc_link_group *lgr)
 {
 	struct smc_llc_qentry *qentry;
@@ -902,7 +932,10 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr)
 	qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
 
 	mutex_lock(&lgr->llc_conf_mutex);
-	smc_llc_cli_add_link(qentry->link, qentry);
+	if (smc_llc_is_local_add_link(&qentry->msg))
+		smc_llc_cli_add_link_invite(qentry->link, qentry);
+	else
+		smc_llc_cli_add_link(qentry->link, qentry);
 	mutex_unlock(&lgr->llc_conf_mutex);
 }
 
@@ -1160,14 +1193,14 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
 	mutex_unlock(&lgr->llc_conf_mutex);
 }
 
-/* enqueue a local add_link req to trigger a new add_link flow, only as SERV */
-void smc_llc_srv_add_link_local(struct smc_link *link)
+/* enqueue a local add_link req to trigger a new add_link flow */
+void smc_llc_add_link_local(struct smc_link *link)
 {
 	struct smc_llc_msg_add_link add_llc = {0};
 
 	add_llc.hd.length = sizeof(add_llc);
 	add_llc.hd.common.type = SMC_LLC_ADD_LINK;
-	/* no dev and port needed, we as server ignore client data anyway */
+	/* no dev and port needed */
 	smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc);
 }
 
@@ -1347,7 +1380,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
 	if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) {
 		/* trigger setup of asymm alt link */
-		smc_llc_srv_add_link_local(lnk);
+		smc_llc_add_link_local(lnk);
 	}
 out:
 	mutex_unlock(&lgr->llc_conf_mutex);
@@ -1476,7 +1509,18 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
 		if (list_empty(&lgr->list))
 			goto out;	/* lgr is terminating */
 		if (lgr->role == SMC_CLNT) {
-			if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK) {
+			if (smc_llc_is_local_add_link(llc)) {
+				if (lgr->llc_flow_lcl.type ==
+				    SMC_LLC_FLOW_ADD_LINK)
+					break;	/* add_link in progress */
+				if (smc_llc_flow_start(&lgr->llc_flow_lcl,
+						       qentry)) {
+					schedule_work(&lgr->llc_add_link_work);
+				}
+				return;
+			}
+			if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+			    !lgr->llc_flow_lcl.qentry) {
 				/* a flow is waiting for this message */
 				smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
 							qentry);
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index a5d2fe3eea61..cc00a2ec4e92 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -103,7 +103,7 @@ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord,
 				  u32 rsn);
 int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry);
 int smc_llc_srv_add_link(struct smc_link *link);
-void smc_llc_srv_add_link_local(struct smc_link *link);
+void smc_llc_add_link_local(struct smc_link *link);
 int smc_llc_init(void) __init;
 
 #endif /* SMC_LLC_H */
-- cgit

From b9979c2e837926c87358024a95c67988477909b1 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:15 +0200
Subject: net/smc: fix handling of delete link requests

As an smc client, the delete link requests are assigned to the flow
when _any_ flow is active. This may break other flows that do not
expect delete link requests during their handling. Fix that by
assigning the request only when an add link flow is active. With that
fix the code for smc client and smc server is the same, so remove the
separate handling.

Reviewed-by: Ursula Braun
Fixes: 9ec6bf19ec8b ("net/smc: llc_del_link_work and use the LLC flow for delete link")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_llc.c | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 30da040ab5b6..fa8cd57a9b32 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -1544,28 +1544,13 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
 		}
 		break;
 	case SMC_LLC_DELETE_LINK:
-		if (lgr->role == SMC_CLNT) {
-			/* server requests to delete this link, send response */
-			if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
-				/* DEL LINK REQ during ADD LINK SEQ */
-				smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
-							qentry);
-				wake_up(&lgr->llc_msg_waiter);
-			} else if (smc_llc_flow_start(&lgr->llc_flow_lcl,
-						      qentry)) {
-				schedule_work(&lgr->llc_del_link_work);
-			}
-		} else {
-			if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
-			    !lgr->llc_flow_lcl.qentry) {
-				/* DEL LINK REQ during ADD LINK SEQ */
-				smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
-							qentry);
-				wake_up(&lgr->llc_msg_waiter);
-			} else if (smc_llc_flow_start(&lgr->llc_flow_lcl,
-						      qentry)) {
-				schedule_work(&lgr->llc_del_link_work);
-			}
+		if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+		    !lgr->llc_flow_lcl.qentry) {
+			/* DEL LINK REQ during ADD LINK SEQ */
+			smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
+			wake_up(&lgr->llc_msg_waiter);
+		} else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+			schedule_work(&lgr->llc_del_link_work);
 		}
 		return;
 	case SMC_LLC_CONFIRM_RKEY:
-- cgit

From 741a49a4dc5fd7e61b37b259dde915083c2c5327 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:16 +0200
Subject: net/smc: do not call dma sync for unmapped memory

The dma related ...sync_sg... functions check the link state before the
dma function is actually called. But the check in smc_link_usable()
allows links in ACTIVATING state which are not yet mapped to dma
memory. Under high load it may happen that the sync_sg functions are
called for such a link, which results in a debug output like

  DMA-API: mlx5_core 0002:00:00.0: device driver tries to sync DMA
  memory it has not allocated [device address=0x0000000103370000]
  [size=65536 bytes]

To fix that, introduce a helper to check for the link state ACTIVE and
use it where appropriate. And move the link state update to ACTIVATING
to the end of smcr_link_init(), when most initial setup is done.

Reviewed-by: Ursula Braun
Fixes: d854fcbfaeda ("net/smc: add new link state and related helpers")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/af_smc.c | 2 +-
 net/smc/smc_core.c | 15 +++++++--------
 net/smc/smc_core.h | 5 +++++
 net/smc/smc_llc.c | 10 +++++-----
 4 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f80591567a3d..d091509b5982 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -352,7 +352,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link,
 	 */
 	mutex_lock(&lgr->llc_conf_mutex);
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
-		if (lgr->lnk[i].state != SMC_LNK_ACTIVE)
+		if (!smc_link_active(&lgr->lnk[i]))
 			continue;
 		rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
 		if (rc)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2e965de7412d..42ba227f3e97 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -318,7 +318,6 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 
 	get_device(&ini->ib_dev->ibdev->dev);
 	atomic_inc(&ini->ib_dev->lnk_cnt);
-	lnk->state = SMC_LNK_ACTIVATING;
 	lnk->link_id = smcr_next_link_id(lgr);
 	lnk->lgr = lgr;
 	lnk->link_idx = link_idx;
@@ -354,6 +353,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	rc = smc_wr_create_link(lnk);
 	if (rc)
 		goto destroy_qp;
+	lnk->state = SMC_LNK_ACTIVATING;
 	return 0;
 
 destroy_qp:
@@ -542,8 +542,7 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
 		smc_wr_wakeup_tx_wait(from_lnk);
 
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
-		if (lgr->lnk[i].state != SMC_LNK_ACTIVE ||
-		    i == from_lnk->link_idx)
+		if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx)
 			continue;
 		if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev &&
 		    from_lnk->ibport == lgr->lnk[i].ibport) {
@@ -1269,7 +1268,7 @@ static bool smcr_lgr_match(struct smc_link_group *lgr,
 		return false;
 
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
-		if (lgr->lnk[i].state != SMC_LNK_ACTIVE)
+		if (!smc_link_active(&lgr->lnk[i]))
 			continue;
 		if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) &&
 		    !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) &&
@@ -1717,14 +1716,14 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 
 void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
 {
-	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk))
+	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk))
 		return;
 	smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
 }
 
 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
 {
-	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk))
+	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk))
 		return;
 	smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
 }
@@ -1736,7 +1735,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
 	if (!conn->lgr || conn->lgr->is_smcd)
 		return;
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
-		if (!smc_link_usable(&conn->lgr->lnk[i]))
+		if (!smc_link_active(&conn->lgr->lnk[i]))
 			continue;
 		smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc,
 				       DMA_FROM_DEVICE);
@@ -1750,7 +1749,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
 	if (!conn->lgr || conn->lgr->is_smcd)
 		return;
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
-		if (!smc_link_usable(&conn->lgr->lnk[i]))
+		if (!smc_link_active(&conn->lgr->lnk[i]))
 			continue;
 		smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc,
 					  DMA_FROM_DEVICE);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index c3ff512fd891..1c4d5439d0ff 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -349,6 +349,11 @@ static inline bool smc_link_usable(struct smc_link *lnk)
 	return true;
 }
 
+static inline bool smc_link_active(struct smc_link *lnk)
+{
+	return lnk->state == SMC_LNK_ACTIVE;
+}
+
 struct smc_sock;
 struct smc_clc_msg_accept_confirm;
 struct smc_clc_msg_local;
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index fa8cd57a9b32..df5b0a6ea848 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -428,7 +428,7 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
 	rtok_ix = 1;
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
 		link = &send_link->lgr->lnk[i];
-		if (link->state == SMC_LNK_ACTIVE && link != send_link) {
+		if (smc_link_active(link) && link != send_link) {
 			rkeyllc->rtoken[rtok_ix].link_id = link->link_id;
 			rkeyllc->rtoken[rtok_ix].rmb_key =
 				htonl(rmb_desc->mr_rx[link->link_idx]->rkey);
@@ -944,7 +944,7 @@ static int smc_llc_active_link_count(struct smc_link_group *lgr)
 	int i, link_count = 0;
 
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
-		if (!smc_link_usable(&lgr->lnk[i]))
+		if (!smc_link_active(&lgr->lnk[i]))
 			continue;
 		link_count++;
 	}
@@ -1622,7 +1622,7 @@ static void smc_llc_rx_response(struct smc_link *link,
 
 	switch (llc_type) {
 	case SMC_LLC_TEST_LINK:
-		if (link->state == SMC_LNK_ACTIVE)
+		if (smc_link_active(link))
 			complete(&link->llc_testlink_resp);
 		break;
 	case SMC_LLC_ADD_LINK:
@@ -1706,7 +1706,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
 	u8 user_data[16] = { 0 };
 	int rc;
 
-	if (link->state != SMC_LNK_ACTIVE)
+	if (!smc_link_active(link))
 		return;		/* don't reschedule worker */
 	expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
 	if (time_is_after_jiffies(expire_time)) {
@@ -1718,7 +1718,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
 	/* receive TEST LINK response over RoCE fabric */
 	rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
 						       SMC_LLC_WAIT_TIME);
-	if (link->state != SMC_LNK_ACTIVE)
+	if (!smc_link_active(link))
 		return;		/* link state changed */
 	if (rc <= 0) {
 		smcr_link_down_cond_sched(link);
-- cgit

From fd7f3a746582e8b17c48d4d8087d38c91f59ba67 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:17 +0200
Subject: net/smc: remove freed buffer from list

Two buffers are allocated for each SMC connection. Each buffer is
added to a buffer list after creation. When the second buffer
allocation fails, the first buffer is freed but not deleted from the
list. This might result in crashes when another connection picks up
the freed buffer later and starts to work with it.

Reviewed-by: Ursula Braun
Fixes: 6511aad3f039 ("net/smc: change smc_buf_free function parameters")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_core.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 42ba227f3e97..ca3dc6af73af 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1772,8 +1772,12 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
 		return rc;
 	/* create rmb */
 	rc = __smc_buf_create(smc, is_smcd, true);
-	if (rc)
+	if (rc) {
+		mutex_lock(&smc->conn.lgr->sndbufs_lock);
+		list_del(&smc->conn.sndbuf_desc->list);
+		mutex_unlock(&smc->conn.lgr->sndbufs_lock);
 		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
+	}
 	return rc;
 }
-- cgit

From 1ad24058335427d046b2e5666bcd15a62ad9e242 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Sat, 18 Jul 2020 15:06:18 +0200
Subject: net/smc: fix restoring of fallback changes

When a listen socket is closed, all non-accepted sockets in its accept
queue are to be released. Inside __smc_release() the helper
smc_restore_fallback_changes() restores the changes done to the socket
without checking if the clcsocket has a file set. This can result in a
crash. Fix this by checking the file pointer first.

Reviewed-by: Ursula Braun
Fixes: f536dffc0b79 ("net/smc: fix closing of fallback SMC sockets")
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/af_smc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index d091509b5982..1163d51196da 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -126,8 +126,10 @@ EXPORT_SYMBOL_GPL(smc_proto6);
 
 static void smc_restore_fallback_changes(struct smc_sock *smc)
 {
-	smc->clcsock->file->private_data = smc->sk.sk_socket;
-	smc->clcsock->file = NULL;
+	if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
+		smc->clcsock->file->private_data = smc->sk.sk_socket;
+		smc->clcsock->file = NULL;
+	}
 }
 
 static int __smc_release(struct smc_sock *smc)
-- cgit

From 639f181f0ee20d3249dbc55f740f0167267180f0 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Mon, 20 Jul 2020 12:41:46 +0100
Subject: rxrpc: Fix sendmsg() returning EPIPE due to recvmsg() returning ENODATA

rxrpc_sendmsg() returns EPIPE if there's an outstanding error, such as
rxrpc_recvmsg() indicating ENODATA when there's nothing for it to read.

Change rxrpc_recvmsg() to return EAGAIN instead if there's nothing to
read, as this particular error doesn't get stored in ->sk_err by the
networking core.

Also change rxrpc_sendmsg() so that it doesn't fail with delayed receive
errors (there's no way for it to report which call, if any, the error
was caused by).

Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both")
Signed-off-by: David Howells
Signed-off-by: David S. Miller
---
 net/rxrpc/recvmsg.c | 2 +-
 net/rxrpc/sendmsg.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 2989742a4aa1..490b1927215c 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -543,7 +543,7 @@ try_again:
 		    list_empty(&rx->recvmsg_q) &&
 		    rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
 			release_sock(&rx->sk);
-			return -ENODATA;
+			return -EAGAIN;
 		}
 
 		if (list_empty(&rx->recvmsg_q)) {
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 1304b8608f56..03a30d014bb6 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -304,7 +304,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 	/* this should be in poll */
 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
-	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+	if (sk->sk_shutdown & SEND_SHUTDOWN)
 		return -EPIPE;
 
 	more = msg->msg_flags & MSG_MORE;
-- cgit

From 2bced6aefa3d0347e11efc610759e1317bfca7a7 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Mon, 20 Jul 2020 16:24:28 +0200
Subject: net/smc: put slot when connection is killed

To get a send slot, smc_wr_tx_get_free_slot() is called, which might
wait for a free slot. When smc_wr_tx_get_free_slot() returns, there is
a check if the connection was killed in the meantime. In that case,
don't only return an error, but also put back the free slot.

Fixes: b290098092e4 ("net/smc: cancel send and receive for terminated socket")
Reviewed-by: Ursula Braun
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_cdc.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index a47e8855e045..ce468ff62a19 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -66,9 +66,13 @@ int smc_cdc_get_free_slot(struct smc_connection *conn,
 	rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
 				     wr_rdma_buf,
 				     (struct smc_wr_tx_pend_priv **)pend);
-	if (conn->killed)
+	if (conn->killed) {
 		/* abnormal termination */
+		if (!rc)
+			smc_wr_tx_put_slot(link,
+					   (struct smc_wr_tx_pend_priv *)pend);
 		rc = -EPIPE;
+	}
 	return rc;
 }
-- cgit

From a9e445029570ab691a44389d68b9c544338586b5 Mon Sep 17 00:00:00 2001
From: Karsten Graul
Date: Mon, 20 Jul 2020 16:24:29 +0200
Subject: net/smc: fix dmb buffer shortage

There is a current limit of 1920 registered dmb buffers per ISM device
for smc-d. One link group can contain 255 connections, and each
connection uses one dmb buffer. When a connection is closed, the
registered buffer is held in a queue and is reused by the next
connection. When a link group is 'full', another link group is created
and uses its own buffer pool. The link groups are added to a list using
list_add(), which puts a new link group at the first position in the
list.

In the situation that many connections are opened (>1920) and a few of
them stay open while others are closed quickly, we end up with at least
8 link groups. For a new connection a matching link group is looked up,
iterating over the list of link groups. The trailing 7 link groups all
have registered dmb buffers which could be reused, while the first link
group has only a few dmb buffers and then hits the 1920 limit. Because
the first link group is not full (the 255 connection limit is not
reached) it is chosen, and finally the connection falls back to TCP
because there is no dmb buffer available in this link group.
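[Editorial aside: to make the scan ordering concrete, here is a minimal
user-space sketch in plain C -- not kernel code, it only mirrors the
insertion semantics of list_add() (prepend) and list_add_tail() (append)
-- of what a head-to-tail walk over four link groups sees:]

	#include <stdio.h>

	#define NGROUPS 4

	int main(void)
	{
		int newest_first[NGROUPS], oldest_first[NGROUPS];
		int i;

		for (i = 0; i < NGROUPS; i++) {
			newest_first[NGROUPS - 1 - i] = i; /* list_add(): head insert */
			oldest_first[i] = i;               /* list_add_tail(): tail insert */
		}
		printf("list_add() walk:      ");
		for (i = 0; i < NGROUPS; i++)
			printf("lgr%d ", newest_first[i]);
		printf("<- newest, underfilled group first\n");
		printf("list_add_tail() walk: ");
		for (i = 0; i < NGROUPS; i++)
			printf("lgr%d ", oldest_first[i]);
		printf("<- older groups with reusable buffers first\n");
		return 0;
	}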
There are multiple ways to fix that: using list_add_tail() allows
scanning older link groups first for free buffers, which ensures that
buffers are reused first. This fixes the problem for smc-r link groups
as well. For smc-d there is an even better way to address this problem,
because smc-d does not have the 255 connections per link group limit.
So fix the problem for smc-d by allowing large link groups.

Fixes: c6ba7c9ba43d ("net/smc: add base infrastructure for SMC-D and ISM")
Reviewed-by: Ursula Braun
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
---
 net/smc/smc_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ca3dc6af73af..f82a2e599917 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -444,7 +444,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 	}
 	smc->conn.lgr = lgr;
 	spin_lock_bh(lgr_lock);
-	list_add(&lgr->list, lgr_list);
+	list_add_tail(&lgr->list, lgr_list);
 	spin_unlock_bh(lgr_lock);
 	return 0;
 
@@ -1311,7 +1311,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
 		    !lgr->sync_err &&
 		    lgr->vlan_id == ini->vlan_id &&
-		    (role == SMC_CLNT ||
+		    (role == SMC_CLNT || ini->is_smcd ||
 		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
 			/* link group found */
 			ini->cln_first_contact = SMC_REUSE_CONTACT;
-- cgit

From 5d93518e06b2102253d465e523f16bd4cb5ce859 Mon Sep 17 00:00:00 2001
From: Murali Karicheri
Date: Mon, 20 Jul 2020 12:43:27 -0400
Subject: net: hsr: check for return value of skb_put_padto()

skb_put_padto() can fail, so check its return value and return a NULL
skb on error. The caller checks for the skb and acts correctly if it
is NULL.

Fixes: 6d6148bc78d2 ("net: hsr: fix incorrect lsdu size in the tag of HSR frames for small frames")
Signed-off-by: Murali Karicheri
Signed-off-by: David S. Miller
---
 net/hsr/hsr_forward.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index e42fd356f073..1ea17752fffc 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -120,15 +120,17 @@ static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame,
 	return skb_clone(frame->skb_std, GFP_ATOMIC);
 }
 
-static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
-			 struct hsr_port *port, u8 proto_version)
+static struct sk_buff *hsr_fill_tag(struct sk_buff *skb,
+				    struct hsr_frame_info *frame,
+				    struct hsr_port *port, u8 proto_version)
 {
 	struct hsr_ethhdr *hsr_ethhdr;
 	int lane_id;
 	int lsdu_size;
 
 	/* pad to minimum packet size which is 60 + 6 (HSR tag) */
-	skb_put_padto(skb, ETH_ZLEN + HSR_HLEN);
+	if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN))
+		return NULL;
 
 	if (port->type == HSR_PT_SLAVE_A)
 		lane_id = 0;
@@ -147,6 +149,8 @@ static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
 	hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
 	hsr_ethhdr->ethhdr.h_proto = htons(proto_version ? ETH_P_HSR : ETH_P_PRP);
+
+	return skb;
 }
 
 static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
@@ -175,9 +179,10 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
 	memmove(dst, src, movelen);
 	skb_reset_mac_header(skb);
 
-	hsr_fill_tag(skb, frame, port, port->hsr->prot_version);
-
-	return skb;
+	/* skb_put_padto free skb on error and hsr_fill_tag returns NULL in
+	 * that case
+	 */
+	return hsr_fill_tag(skb, frame, port, port->hsr->prot_version);
 }
 
 /* If the original frame was an HSR tagged frame, just clone it to be sent
-- cgit

From ae372cb1750f6c95370f92fe5f5620e0954663ba Mon Sep 17 00:00:00 2001
From: wenxu
Date: Sun, 19 Jul 2020 20:30:37 +0800
Subject: net/sched: act_ct: fix restore the qdisc_skb_cb after defrag

Fragmented packets are defragmented in tcf_ct_handle_fragments(), which
clears skb->cb and thereby clears the qdisc_skb_cb as well. So the
qdisc_skb_cb should be stored before the defrag and restored after it.
Also update pkt_len after all the fragments have been defragmented into
one packet, to make the counters of the following actions correct.

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: wenxu
Signed-off-by: David S. Miller
---
 net/sched/act_ct.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 67504aece9ae..5928efb6449c 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -673,9 +673,10 @@ static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
 }
 
 static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
-				   u8 family, u16 zone)
+				   u8 family, u16 zone, bool *defrag)
 {
 	enum ip_conntrack_info ctinfo;
+	struct qdisc_skb_cb cb;
 	struct nf_conn *ct;
 	int err = 0;
 	bool frag;
@@ -693,6 +694,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 		return err;
 
 	skb_get(skb);
+	cb = *qdisc_skb_cb(skb);
 
 	if (family == NFPROTO_IPV4) {
 		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
@@ -703,6 +705,9 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 		local_bh_enable();
 		if (err && err != -EINPROGRESS)
 			goto out_free;
+
+		if (!err)
+			*defrag = true;
 	} else { /* NFPROTO_IPV6 */
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
@@ -711,12 +716,16 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 		err = nf_ct_frag6_gather(net, skb, user);
 		if (err && err != -EINPROGRESS)
 			goto out_free;
+
+		if (!err)
+			*defrag = true;
#else
 		err = -EOPNOTSUPP;
 		goto out_free;
#endif
 	}
 
+	*qdisc_skb_cb(skb) = cb;
 	skb_clear_hash(skb);
 	skb->ignore_df = 1;
 	return err;
@@ -914,6 +923,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	int nh_ofs, err, retval;
 	struct tcf_ct_params *p;
 	bool skip_add = false;
+	bool defrag = false;
 	struct nf_conn *ct;
 	u8 family;
 
@@ -946,7 +956,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	 */
 	nh_ofs = skb_network_offset(skb);
 	skb_pull_rcsum(skb, nh_ofs);
-	err = tcf_ct_handle_fragments(net, skb, family, p->zone);
+	err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
 	if (err == -EINPROGRESS) {
 		retval = TC_ACT_STOLEN;
 		goto out;
@@ -1014,6 +1024,8 @@ out_push:
 
 out:
 	tcf_action_update_bstats(&c->common, skb);
+	if (defrag)
+		qdisc_skb_cb(skb)->pkt_len = skb->len;
 	return retval;
 
 drop:
-- cgit

From 6ef9dcb78046b346b5508ca1659848b136a343c2 Mon Sep 17 00:00:00 2001
From: Tung Nguyen
Date: Tue, 21 Jul 2020 08:57:05 +0700
Subject: tipc: allow to build NACK message in link timeout function
From 6ef9dcb78046b346b5508ca1659848b136a343c2 Mon Sep 17 00:00:00 2001
From: Tung Nguyen
Date: Tue, 21 Jul 2020 08:57:05 +0700
Subject: tipc: allow to build NACK message in link timeout function

Commit 02288248b051 ("tipc: eliminate gap indicator from ACK messages")
eliminated sending of the 'gap' indicator in regular ACK messages and
only allowed NACK messages to be built when probe/probe_reply is
enabled. However, the necessary correction for building NACK messages
was missed in the tipc_link_timeout() function. This leads to
significant delays and link resets (due to retransmission failure) in
lossy environments.

This commit fixes it by setting the 'probe' flag to 'true' when the
receive deferred queue is not empty. As a result, a NACK message will
be built and sent back to the peer.

Fixes: 02288248b051 ("tipc: eliminate gap indicator from ACK messages")
Acked-by: Jon Maloy
Signed-off-by: Tung Nguyen
Signed-off-by: David S. Miller
---
 net/tipc/link.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 263d950e70e9..d40f8e5b7683 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -827,11 +827,11 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
 		state |= l->bc_rcvlink->rcv_unacked;
 		state |= l->rcv_unacked;
 		state |= !skb_queue_empty(&l->transmq);
-		state |= !skb_queue_empty(&l->deferdq);
 		probe = mstate->probing;
 		probe |= l->silent_intv_cnt;
 		if (probe || mstate->monitoring)
 			l->silent_intv_cnt++;
+		probe |= !skb_queue_empty(&l->deferdq);
 		if (l->snd_nxt == l->checkpoint) {
 			tipc_link_update_cwin(l, 0, 0);
 			probe = true;
-- cgit

From f2b2c55e512879a05456eaf5de4d1ed2f7757509 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 21 Jul 2020 15:15:30 +0900
Subject: udp: Copy has_conns in reuseport_grow().

If an unconnected socket in a UDP reuseport group connect()s, has_conns
is set to 1. Then, when a packet is received, udp[46]_lib_lookup2()
scans all sockets in udp_hslot looking for the connected socket with
the highest score.

However, when the number of sockets bound to the port exceeds
max_socks, reuseport_grow() resets has_conns to 0. This can cause
udp[46]_lib_lookup2() to return without scanning all sockets, with the
result that packets sent to connected sockets may be distributed to
unconnected ones.

Therefore, reuseport_grow() should copy has_conns.

Fixes: acdcecc61285 ("udp: correct reuseport selection with connected sockets")
CC: Willem de Bruijn
Reviewed-by: Benjamin Herrenschmidt
Signed-off-by: Kuniyuki Iwashima
Acked-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/sock_reuseport.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index adcb3aea576d..bbdd3c7b6cb5 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -101,6 +101,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
 	more_reuse->prog = reuse->prog;
 	more_reuse->reuseport_id = reuse->reuseport_id;
 	more_reuse->bind_inany = reuse->bind_inany;
+	more_reuse->has_conns = reuse->has_conns;
 
 	memcpy(more_reuse->socks, reuse->socks,
 	       reuse->num_socks * sizeof(struct sock *));
-- cgit
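To see why the copied flag matters in the receive path: when has_conns is set, the hash-based pick from reuseport_select_sock() cannot be returned immediately, because a connected socket later in the slot may match the packet's 4-tuple exactly. A simplified rendering of that decision, condensed from udp4_lib_lookup2(); this is not the verbatim kernel code:

	if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
		hash = udp_ehashfn(net, daddr, hnum, saddr, sport);
		result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr));
		if (result && !reuseport_has_conns(sk, false))
			return result;	/* no connected sockets in the group: pick is final */
		/* otherwise keep scoring: a connected socket may match the 4-tuple */
	}

With has_conns lost in reuseport_grow(), the early return fires even though connected sockets exist, so their packets can be handed to an unconnected socket.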
From efc6b6f6c3113e8b203b9debfb72d81e0f3dcace Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 21 Jul 2020 15:15:31 +0900
Subject: udp: Improve load balancing for SO_REUSEPORT.

Currently, SO_REUSEPORT does not work well if connected sockets are in
a UDP reuseport group. Then reuseport_has_conns() returns true and the
result of reuseport_select_sock() is discarded. Also, since all
unconnected sockets have the same score, the first unconnected socket
in udp_hslot always receives every packet sent to unconnected sockets.
So, the result of reuseport_select_sock() should be used for load
balancing.

The noteworthy point is that the unconnected sockets placed after
connected sockets in sock_reuseport.socks will receive more packets
than the others because of the algorithm in reuseport_select_sock().

  index | connected | reciprocal_scale | result
  ---------------------------------------------
    0   |    no     |       20%        |  40%
    1   |    no     |       20%        |  20%
    2   |    yes    |       20%        |   0%
    3   |    no     |       20%        |  40%
    4   |    yes    |       20%        |   0%

If most of the sockets are connected, this can be a problem, but it
still works better than the current behaviour.

Fixes: acdcecc61285 ("udp: correct reuseport selection with connected sockets")
CC: Willem de Bruijn
Reviewed-by: Benjamin Herrenschmidt
Signed-off-by: Kuniyuki Iwashima
Acked-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/udp.c | 15 +++++++++------
 net/ipv6/udp.c | 15 +++++++++------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1b7ebbcae497..99251d3c70d0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -416,7 +416,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 				     struct udp_hslot *hslot2,
 				     struct sk_buff *skb)
 {
-	struct sock *sk, *result;
+	struct sock *sk, *result, *reuseport_result;
 	int score, badness;
 	u32 hash = 0;
 
@@ -426,17 +426,20 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif);
 		if (score > badness) {
+			reuseport_result = NULL;
+
 			if (sk->sk_reuseport &&
 			    sk->sk_state != TCP_ESTABLISHED) {
 				hash = udp_ehashfn(net, daddr, hnum,
 						   saddr, sport);
-				result = reuseport_select_sock(sk, hash, skb,
-							sizeof(struct udphdr));
-				if (result && !reuseport_has_conns(sk, false))
-					return result;
+				reuseport_result = reuseport_select_sock(sk, hash, skb,
+									 sizeof(struct udphdr));
+				if (reuseport_result && !reuseport_has_conns(sk, false))
+					return reuseport_result;
 			}
+
+			result = reuseport_result ? : sk;
 			badness = score;
-			result = sk;
 		}
 	}
 	return result;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7d4151747340..9503c87ac0b3 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -148,7 +148,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		int dif, int sdif, struct udp_hslot *hslot2,
 		struct sk_buff *skb)
 {
-	struct sock *sk, *result;
+	struct sock *sk, *result, *reuseport_result;
 	int score, badness;
 	u32 hash = 0;
 
@@ -158,17 +158,20 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 		score = compute_score(sk, net, saddr, sport,
 				      daddr, hnum, dif, sdif);
 		if (score > badness) {
+			reuseport_result = NULL;
+
 			if (sk->sk_reuseport &&
 			    sk->sk_state != TCP_ESTABLISHED) {
 				hash = udp6_ehashfn(net, daddr, hnum,
 						    saddr, sport);
-				result = reuseport_select_sock(sk, hash, skb,
-							sizeof(struct udphdr));
-				if (result && !reuseport_has_conns(sk, false))
-					return result;
+				reuseport_result = reuseport_select_sock(sk, hash, skb,
+									 sizeof(struct udphdr));
+				if (reuseport_result && !reuseport_has_conns(sk, false))
+					return reuseport_result;
 			}
-			result = sk;
+
+			result = reuseport_result ? : sk;
 			badness = score;
 		}
 	}
-- cgit
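The skew in the table can be reproduced in plain userspace C. The sketch below assumes the selection strategy that commit acdcecc61285 introduced in reuseport_select_sock(): start at index reciprocal_scale(hash, socks) and walk forward, wrapping around, until an unconnected socket is found. All names here are local to the example:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* same arithmetic as the kernel's reciprocal_scale(): map val into [0, ep_ro) */
	static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
	{
		return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
	}

	int main(void)
	{
		const int connected[5] = { 0, 0, 1, 0, 1 };
		unsigned long hits[5] = { 0 };
		unsigned long n;
		int i;

		for (n = 0; n < 1000000; n++) {
			/* mrand48() stands in for the packet's flow hash */
			i = reciprocal_scale((uint32_t)mrand48(), 5);
			while (connected[i])	/* skip connected sockets, wrapping */
				i = (i + 1) % 5;
			hits[i]++;
		}
		for (i = 0; i < 5; i++)
			printf("index %d: %2.0f%%\n", i, 100.0 * hits[i] / 1000000);
		return 0;	/* prints roughly 40 / 20 / 0 / 40 / 0, matching the table */
	}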
From 9bb5fbea59f36a589ef886292549ca4052fe676c Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang
Date: Tue, 21 Jul 2020 15:02:57 +0800
Subject: net-sysfs: add a newline when printing 'tx_timeout' by sysfs

When I cat 'tx_timeout' via sysfs, it displays as follows. It's better
to add a newline for easier reading.

  root@syzkaller:~# cat /sys/devices/virtual/net/lo/queues/tx-0/tx_timeout
  0root@syzkaller:~#

Signed-off-by: Xiongfeng Wang
Signed-off-by: David S. Miller
---
 net/core/net-sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e353b822bb15..7bd6440c63bf 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1108,7 +1108,7 @@ static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
 	trans_timeout = queue->trans_timeout;
 	spin_unlock_irq(&queue->_xmit_lock);
 
-	return sprintf(buf, "%lu", trans_timeout);
+	return sprintf(buf, fmt_ulong, trans_timeout);
 }
 
 static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
-- cgit

From b0a422772fec29811e293c7c0e6f991c0fd9241d Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Tue, 21 Jul 2020 17:11:44 +0800
Subject: net: udp: Fix wrong clean up for IS_UDPLITE macro

We can't use IS_UDPLITE to replace the udp_sk->pcflag test when
UDPLITE_RECV_CC is checked.

Fixes: b2bf1e2659b1 ("[UDP]: Clean up for IS_UDPLITE macro")
Signed-off-by: Miaohe Lin
Signed-off-by: David S. Miller
---
 net/ipv4/udp.c | 2 +-
 net/ipv6/udp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 99251d3c70d0..4077d589b72e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2054,7 +2054,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	/*
 	 *	UDP-Lite specific tests, ignored on UDP sockets
 	 */
-	if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+	if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
 
 		/*
 		 * MIB statistics other than incrementing the error count are
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9503c87ac0b3..a8d74f44056a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -646,7 +646,7 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
 	 */
-	if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+	if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
 
 		if (up->pcrlen == 0) {          /* full coverage was set  */
 			net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
-- cgit
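The one-character nature of this bug deserves a spelled-out example. IS_UDPLITE() yields a boolean 0/1, while UDPLITE_RECV_CC is a bit flag in up->pcflag; assuming the flag values from include/linux/udp.h, the broken test could never be true:

	#define UDPLITE_BIT     0x1	/* set for UDP-Lite sockets */
	#define UDPLITE_SEND_CC 0x2	/* sender checksum coverage set */
	#define UDPLITE_RECV_CC 0x4	/* receiver checksum coverage set */

	int is_udplite = 1;		/* IS_UDPLITE(sk) is 0 or 1 */
	int pcflag = UDPLITE_BIT | UDPLITE_RECV_CC;

	int broken = is_udplite & UDPLITE_RECV_CC;	/* 1 & 0x4 == 0, always */
	int fixed  = pcflag & UDPLITE_RECV_CC;		/* tests the real flag bit */

So with the cleanup in place, the UDP-Lite coverage check was silently skipped on every packet; restoring the up->pcflag test makes the flag bit visible again.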
From 8210e344ccb798c672ab237b1a4f241bda08909b Mon Sep 17 00:00:00 2001
From: guodeqing
Date: Thu, 16 Jul 2020 16:12:08 +0800
Subject: ipvs: fix the connection sync failed in some cases

The sync_thread_backup only checks whether sk_receive_queue is empty.
There is a situation in which the connection entries cannot be synced:
when sk_receive_queue is empty and sk_rmem_alloc is larger than
sk_rcvbuf, the sync packets are dropped in
__udp_enqueue_schedule_skb(). This happens because the packets in
reader_queue are never read, so the rmem is not reclaimed.

Here I add a check of whether the reader_queue of the udp sock is
empty or not to solve this problem.

Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
Reported-by: zhouxudong
Signed-off-by: guodeqing
Acked-by: Julian Anastasov
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/ipvs/ip_vs_sync.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 605e0f68f8bd..2b8abbfe018c 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1717,6 +1717,8 @@ static int sync_thread_backup(void *data)
 {
 	struct ip_vs_sync_thread_data *tinfo = data;
 	struct netns_ipvs *ipvs = tinfo->ipvs;
+	struct sock *sk = tinfo->sock->sk;
+	struct udp_sock *up = udp_sk(sk);
 	int len;
 
 	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
@@ -1724,12 +1726,14 @@ static int sync_thread_backup(void *data)
 		ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
 
 	while (!kthread_should_stop()) {
-		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
-			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
-			 || kthread_should_stop());
+		wait_event_interruptible(*sk_sleep(sk),
+					 !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+					 !skb_queue_empty_lockless(&up->reader_queue) ||
+					 kthread_should_stop());
 
 		/* do we have data now? */
-		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+		while (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+		       !skb_queue_empty_lockless(&up->reader_queue)) {
 			len = ip_vs_receive(tinfo->sock, tinfo->buf,
 					ipvs->bcfg.sync_maxlen);
 			if (len <= 0) {
-- cgit
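Since commit 2276f58ac589, UDP keeps two receive queues and skbs are spliced from sk_receive_queue into reader_queue by the reader, so an emptiness test on sk_receive_queue alone can miss pending data. The fix reads naturally as a predicate; a hypothetical helper capturing the new wait condition (the patch open-codes it):

	/* hypothetical helper; the patch writes this condition inline */
	static bool sync_rx_pending(struct sock *sk)
	{
		struct udp_sock *up = udp_sk(sk);

		return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
		       !skb_queue_empty_lockless(&up->reader_queue);
	}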
From 2f2a7ffad5c6cbf3d438e813cfdc88230e185ba6 Mon Sep 17 00:00:00 2001
From: Peilin Ye
Date: Wed, 22 Jul 2020 11:19:01 -0400
Subject: AX.25: Fix out-of-bounds read in ax25_connect()

Checks on `addr_len` and `fsa->fsa_ax25.sax25_ndigis` are insufficient.
ax25_connect() can go out of bounds when `fsa->fsa_ax25.sax25_ndigis`
equals 7 or 8. Fix it.

This issue has been reported as a KMSAN uninit-value bug, because in
such a case, ax25_connect() reaches into the uninitialized portion of
the `struct sockaddr_storage` statically allocated in __sys_connect().

It is safe to remove the `fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS`
check because `addr_len` is guaranteed to be less than or equal to
`sizeof(struct full_sockaddr_ax25)`.

Reported-by: syzbot+c82752228ed975b0a623@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?id=55ef9d629f3b3d7d70b69558015b63b48d01af66
Signed-off-by: Peilin Ye
Signed-off-by: David S. Miller
---
 net/ax25/af_ax25.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index fd91cd34f25e..ef5bf116157a 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1187,7 +1187,9 @@ static int __must_check ax25_connect(struct socket *sock,
 	if (addr_len > sizeof(struct sockaddr_ax25) &&
 	    fsa->fsa_ax25.sax25_ndigis != 0) {
 		/* Valid number of digipeaters ? */
-		if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) {
+		if (fsa->fsa_ax25.sax25_ndigis < 1 ||
+		    addr_len < sizeof(struct sockaddr_ax25) +
+			       sizeof(ax25_address) * fsa->fsa_ax25.sax25_ndigis) {
 			err = -EINVAL;
 			goto out_release;
 		}
-- cgit

From 8f13399db22f909a35735bf8ae2f932e0c8f0e30 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 22 Jul 2020 23:52:11 +0800
Subject: sctp: shrink stream outq only when new outcnt < old outcnt

There is no need to walk outq->out_chunk_list with list_for_each when
the new outcnt >= the old outcnt, as no chunk with a sid higher than
the new (outcnt - 1) exists in the outqueue.

While at it, also move the list_for_each code into a new function,
sctp_stream_shrink_out(), which will be used in the next patch.

Signed-off-by: Xin Long
Signed-off-by: David S. Miller
---
 net/sctp/stream.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 67f7e71f9129..4f87693cc036 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -22,17 +22,11 @@
 #include
 #include
 
-/* Migrates chunks from stream queues to new stream queues if needed,
- * but not across associations. Also, removes those chunks to streams
- * higher than the new max.
- */
-static void sctp_stream_outq_migrate(struct sctp_stream *stream,
-				     struct sctp_stream *new, __u16 outcnt)
+static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt)
 {
 	struct sctp_association *asoc;
 	struct sctp_chunk *ch, *temp;
 	struct sctp_outq *outq;
-	int i;
 
 	asoc = container_of(stream, struct sctp_association, stream);
 	outq = &asoc->outqueue;
@@ -56,6 +50,19 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
 		sctp_chunk_free(ch);
 	}
+}
+
+/* Migrates chunks from stream queues to new stream queues if needed,
+ * but not across associations. Also, removes those chunks to streams
+ * higher than the new max.
+ */
+static void sctp_stream_outq_migrate(struct sctp_stream *stream,
+				     struct sctp_stream *new, __u16 outcnt)
+{
+	int i;
+
+	if (stream->outcnt > outcnt)
+		sctp_stream_shrink_out(stream, outcnt);
 
 	if (new) {
 		/* Here we actually move the old ext stuff into the new
-- cgit
From 3ecdda3e9ad837cf9cb41b6faa11b1af3a5abc0c Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Wed, 22 Jul 2020 23:52:12 +0800
Subject: sctp: shrink stream outq when fails to do addstream reconf

When adding a stream with stream reconf, the new stream is initially in
the CLOSED state, but new out chunks can still be enqueued. Then, once
the confirmation from the peer arrives, the state changes to OPEN.

However, if the peer denies, the stream has to be rolled back. But when
doing that, the code only sets the stream outcnt back, and the chunks
already in the new stream don't get purged. As a result, these chunks
can still be dequeued in sctp_outq_dequeue_data(). Since their stream
is still CLOSED, such a chunk will be enqueued at the head again by
sctp_outq_head_data(). This chunk will never be sent out, and the
chunks after it can never be dequeued. The assoc will hang in a dead
loop of sending this chunk.

To fix it, this patch purges the chunks already in the new stream by
calling sctp_stream_shrink_out() when the addstream reconf fails.

Fixes: 11ae76e67a17 ("sctp: implement receiver-side procedures for the Reconf Response Parameter")
Reported-by: Ying Xu
Signed-off-by: Xin Long
Signed-off-by: David S. Miller
---
 net/sctp/stream.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 4f87693cc036..bda2536dd740 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -1044,11 +1044,13 @@ struct sctp_chunk *sctp_process_strreset_resp(
 		nums = ntohs(addstrm->number_of_streams);
 		number = stream->outcnt - nums;
 
-		if (result == SCTP_STRRESET_PERFORMED)
+		if (result == SCTP_STRRESET_PERFORMED) {
 			for (i = number; i < stream->outcnt; i++)
 				SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
-		else
+		} else {
+			sctp_stream_shrink_out(stream, number);
 			stream->outcnt = number;
+		}
 
 		*evp = sctp_ulpevent_make_stream_change_event(asoc, flags,
 			0, nums, GFP_ATOMIC);
-- cgit

From 8885bb0621f01a6c82be60a91e5fc0f6e2f71186 Mon Sep 17 00:00:00 2001
From: Peilin Ye
Date: Wed, 22 Jul 2020 12:05:12 -0400
Subject: AX.25: Prevent out-of-bounds read in ax25_sendmsg()

Checks on `addr_len` and `usax->sax25_ndigis` are insufficient.
ax25_sendmsg() can go out of bounds when `usax->sax25_ndigis` equals
7 or 8. Fix it.

It is safe to remove the `usax->sax25_ndigis > AX25_MAX_DIGIS` check,
since `addr_len` is guaranteed to be less than or equal to
`sizeof(struct full_sockaddr_ax25)`.

Signed-off-by: Peilin Ye
Signed-off-by: David S. Miller
---
 net/ax25/af_ax25.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ef5bf116157a..0862fe49d434 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1509,7 +1509,8 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 			struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax;
 
 			/* Valid number of digipeaters ? */
-			if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) {
+			if (usax->sax25_ndigis < 1 || addr_len < sizeof(struct sockaddr_ax25) +
+			    sizeof(ax25_address) * usax->sax25_ndigis) {
 				err = -EINVAL;
 				goto out;
 			}
-- cgit
From 17ad73e941b71f3bec7523ea4e9cbc3752461c2d Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Thu, 23 Jul 2020 17:49:57 +0300
Subject: AX.25: Prevent integer overflows in connect and sendmsg

We recently added some bounds checking in ax25_connect() and
ax25_sendmsg(), and so we removed the AX25_MAX_DIGIS checks because
they were no longer required.

Unfortunately, I believe they are required to prevent integer
overflows, so I have added them back.

Fixes: 8885bb0621f0 ("AX.25: Prevent out-of-bounds read in ax25_sendmsg()")
Fixes: 2f2a7ffad5c6 ("AX.25: Fix out-of-bounds read in ax25_connect()")
Signed-off-by: Dan Carpenter
Signed-off-by: David S. Miller
---
 net/ax25/af_ax25.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 0862fe49d434..dec3f35467c9 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1188,6 +1188,7 @@ static int __must_check ax25_connect(struct socket *sock,
 	    fsa->fsa_ax25.sax25_ndigis != 0) {
 		/* Valid number of digipeaters ? */
 		if (fsa->fsa_ax25.sax25_ndigis < 1 ||
+		    fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS ||
 		    addr_len < sizeof(struct sockaddr_ax25) +
 			       sizeof(ax25_address) * fsa->fsa_ax25.sax25_ndigis) {
 			err = -EINVAL;
@@ -1509,7 +1510,9 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 			struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax;
 
 			/* Valid number of digipeaters ? */
-			if (usax->sax25_ndigis < 1 || addr_len < sizeof(struct sockaddr_ax25) +
-			    sizeof(ax25_address) * usax->sax25_ndigis) {
+			if (usax->sax25_ndigis < 1 ||
+			    usax->sax25_ndigis > AX25_MAX_DIGIS ||
+			    addr_len < sizeof(struct sockaddr_ax25) +
+			    sizeof(ax25_address) * usax->sax25_ndigis) {
 				err = -EINVAL;
 				goto out;
 			}
-- cgit
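A concrete scenario motivates re-adding the bound. sax25_ndigis is user-controlled, and on a 32-bit kernel (32-bit size_t) the multiplication on the right-hand side of the length check can wrap, since an ax25_address is a 7-byte callsign. The numbers below are an illustration under that assumption:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* assuming 32-bit size_t and sizeof(ax25_address) == 7 */
		int ndigis = 613566757;	/* attacker-chosen sax25_ndigis */
		uint32_t rhs = (uint32_t)(7u * (uint32_t)ndigis);

		/* 7 * 613566757 = 4294967299 = 2^32 + 3, so rhs wraps to 3:
		 * addr_len < sizeof(struct sockaddr_ax25) + 3 is easily false,
		 * the check passes, yet the code would then read 613566757
		 * digipeaters. ndigis > AX25_MAX_DIGIS (8) rejects this first.
		 */
		printf("rhs = %u\n", rhs);	/* prints: rhs = 3 */
		return 0;
	}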
From 76be93fc0702322179bb0ea87295d820ee46ad14 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Thu, 23 Jul 2020 12:00:06 -0700
Subject: tcp: allow at most one TLP probe per flight

Previously, TLP could send multiple probes of new data in one flight.
This happens when the sender is cwnd limited. After the initial TLP
containing new data is sent, the sender receives another ACK that acks
partial inflight. It may re-arm another TLP timer to send more, if no
further ACK returns before the next TLP timeout (PTO) expires. In
theory the sender could keep sending TLPs until the send queue is
depleted. This only happens if the sender sees such an irregular,
uncommon ACK pattern, but it is generally undesirable behavior,
especially during congestion. The original TLP design restricts TLP to
one probe per inflight, as published in "Reducing Web Latency: the
Virtue of Gentle Aggression", SIGCOMM 2013.

This patch changes TLP to send at most one probe per inflight.

Note that if the sender is app-limited, TLP retransmits old data and
does not have this issue.

Signed-off-by: Yuchung Cheng
Signed-off-by: Neal Cardwell
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c  | 11 ++++++-----
 net/ipv4/tcp_output.c | 13 ++++++++-----
 2 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9615e72656d1..518f04355fbf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3488,10 +3488,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 	}
 }
 
-/* This routine deals with acks during a TLP episode.
- * We mark the end of a TLP episode on receiving TLP dupack or when
- * ack is after tlp_high_seq.
- * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+/* This routine deals with acks during a TLP episode and ends an episode by
+ * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
  */
 static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 {
@@ -3500,7 +3498,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	if (before(ack, tp->tlp_high_seq))
 		return;
 
-	if (flag & FLAG_DSACKING_ACK) {
+	if (!tp->tlp_retrans) {
+		/* TLP of new data has been acknowledged */
+		tp->tlp_high_seq = 0;
+	} else if (flag & FLAG_DSACKING_ACK) {
 		/* This DSACK means original and TLP probe arrived; no loss */
 		tp->tlp_high_seq = 0;
 	} else if (after(ack, tp->tlp_high_seq)) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5f5b2f0b0e60..0bc05d68cd74 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2624,6 +2624,11 @@ void tcp_send_loss_probe(struct sock *sk)
 	int pcount;
 	int mss = tcp_current_mss(sk);
 
+	/* At most one outstanding TLP */
+	if (tp->tlp_high_seq)
+		goto rearm_timer;
+
+	tp->tlp_retrans = 0;
 	skb = tcp_send_head(sk);
 	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
 		pcount = tp->packets_out;
@@ -2641,10 +2646,6 @@ void tcp_send_loss_probe(struct sock *sk)
 		return;
 	}
 
-	/* At most one outstanding TLP retransmission. */
-	if (tp->tlp_high_seq)
-		goto rearm_timer;
-
 	if (skb_still_in_host_queue(sk, skb))
 		goto rearm_timer;
 
@@ -2666,10 +2667,12 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
+	tp->tlp_retrans = 1;
+
+probe_sent:
 	/* Record snd_nxt for loss detection. */
 	tp->tlp_high_seq = tp->snd_nxt;
 
-probe_sent:
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
 	/* Reset s.t. tcp_rearm_rto will restart timer from now */
 	inet_csk(sk)->icsk_pending = 0;
-- cgit

From c2b69f24ebd166a13cdc9909b50f33228895998b Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Fri, 24 Jul 2020 10:50:22 +1000
Subject: flow_offload: Move rhashtable inclusion to the source file

I noticed that touching linux/rhashtable.h causes lib/vsprintf.c to be
rebuilt. This dependency came through a bogus inclusion in the file
net/flow_offload.h. This patch moves it to the right place.

This patch also removes a lingering rhashtable inclusion in cls_api
created by the same commit.

Fixes: 4e481908c51b ("flow_offload: move tc indirect block to...")
Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/flow_offload.c | 1 +
 net/sched/cls_api.c     | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index b739cfab796e..2076219b8ba5 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <linux/rhashtable.h>
 
 struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e62beec0d844..4619cb3cb0a8 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include <linux/rhashtable.h>
 #include
 #include
 #include
-- cgit

From af9f691f0f5bdd1ade65a7b84927639882d7c3e5 Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Fri, 24 Jul 2020 09:45:51 -0700
Subject: qrtr: orphan socket in qrtr_release()

We have to detach the sock from the socket in qrtr_release(),
otherwise skb->sk may still reference this socket when the skb is
released in tun->queue. In particular, sk->sk_wq still points to
&sock->wq, which leads to a UAF.

Reported-and-tested-by: syzbot+6720d64f31c081c2f708@syzkaller.appspotmail.com
Fixes: 28fb4e59a47d ("net: qrtr: Expose tunneling endpoint to user space")
Cc: Bjorn Andersson
Cc: Eric Dumazet
Signed-off-by: Cong Wang
Reviewed-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/qrtr/qrtr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 24a8c3c6da0d..300a104b9a0f 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1180,6 +1180,7 @@ static int qrtr_release(struct socket *sock)
 		sk->sk_state_change(sk);
 
 	sock_set_flag(sk, SOCK_DEAD);
+	sock_orphan(sk);
 	sock->sk = NULL;
 
 	if (!sock_flag(sk, SOCK_ZAPPED))
-- cgit
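The crux of the qrtr fix: setting sock->sk = NULL alone leaves sk->sk_wq pointing at &sock->wq, which is freed with the socket, so any skb still carrying sk can chase a dangling pointer. sock_orphan() severs those back-pointers first. A condensed sketch of the release-path ordering the fix establishes, with the qrtr-specific steps elided:

	/* condensed release ordering; not the full qrtr_release() */
	sock_set_flag(sk, SOCK_DEAD);
	sock_orphan(sk);	/* clears sk->sk_socket and sk->sk_wq under sk_callback_lock */
	sock->sk = NULL;
	sock_put(sk);		/* in-flight skbs may still hold their own references to sk */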
From 7df5cb75cfb8acf96c7f2342530eb41e0c11f4c3 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan
Date: Thu, 23 Jul 2020 11:31:48 -0600
Subject: dev: Defer free of skbs in flush_backlog

IRQs are disabled when freeing skbs in the input queue. Use the
IRQ-safe variant to free skbs here.

Fixes: 145dd5f9c88f ("net: flush the softnet backlog in process context")
Signed-off-by: Subash Abhinov Kasiviswanathan
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 90b59fc50dc9..7a774ebf64e2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5601,7 +5601,7 @@ static void flush_backlog(struct work_struct *work)
 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
-			kfree_skb(skb);
+			dev_kfree_skb_irq(skb);
 			input_queue_head_incr(sd);
 		}
 	}
-- cgit
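A closing note on the last fix: flush_backlog() walks the backlog with local IRQs disabled, where kfree_skb() is not safe, while dev_kfree_skb_irq() merely parks the skb on the per-CPU softnet completion list and raises NET_TX_SOFTIRQ so the actual free happens later. The rule of thumb is what dev_kfree_skb_any() encodes; a paraphrase under a hypothetical name (the in-tree helper is a thin wrapper with a free-reason argument):

	/* paraphrased from net/core/dev.c; not the verbatim kernel helper */
	void dev_kfree_skb_any_sketch(struct sk_buff *skb)
	{
		if (in_irq() || irqs_disabled())
			dev_kfree_skb_irq(skb);	/* defer the free to softirq context */
		else
			dev_kfree_skb(skb);	/* safe to free inline */
	}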