From 3a5913183aa1b14148c723bda030e6102ad73008 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sun, 9 Oct 2022 22:16:43 +0300 Subject: xfrm: fix "disable_policy" on ipv4 early demux The commit in the "Fixes" tag tried to avoid a case where policy check is ignored due to dst caching in next hops. However, when the traffic is locally consumed, the dst may be cached in a local TCP or UDP socket as part of early demux. In this case the "disable_policy" flag is not checked as ip_route_input_noref() was only called before caching, and thus, packets after the initial packet in a flow will be dropped if not matching policies. Fix by checking the "disable_policy" flag also when a valid dst is already available. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216557 Reported-by: Monil Patel Fixes: e6175a2ed1f1 ("xfrm: fix "disable_policy" flag use when arriving from different devices") Signed-off-by: Eyal Birger ---- v2: use dev instead of skb->dev Signed-off-by: Steffen Klassert --- net/ipv4/ip_input.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1b512390b3cf..e880ce77322a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -366,6 +366,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, iph->tos, dev); if (unlikely(err)) goto drop_error; + } else { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) + IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID -- cgit From d83f7040e18489265b4b121f33f99b02e52dabda Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Tue, 11 Oct 2022 11:01:37 +0300 Subject: xfrm: lwtunnel: squelch kernel warning in case XFRM encap type is not available Ido reported that a kernel warning [1] can be triggered from user space when the kernel is compiled with CONFIG_MODULES=y and CONFIG_XFRM=n when adding an xfrm encap type route, e.g: $ ip route add 198.51.100.0/24 dev dummy1 encap xfrm if_id 1 Error: lwt encapsulation type not supported. The reason for the warning is that the LWT infrastructure has an autoloading feature which is meant only for encap types that don't use a net device, which is not the case in xfrm encap. Mute this warning for xfrm encap as there's no encap module to autoload in this case. [1] WARNING: CPU: 3 PID: 2746262 at net/core/lwtunnel.c:57 lwtunnel_valid_encap_type+0x4f/0x120 [...] Call Trace: rtm_to_fib_config+0x211/0x350 inet_rtm_newroute+0x3a/0xa0 rtnetlink_rcv_msg+0x154/0x3c0 netlink_rcv_skb+0x49/0xf0 netlink_unicast+0x22f/0x350 netlink_sendmsg+0x208/0x440 ____sys_sendmsg+0x21f/0x250 ___sys_sendmsg+0x83/0xd0 __sys_sendmsg+0x54/0xa0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Reported-by: Ido Schimmel Fixes: 2c2493b9da91 ("xfrm: lwtunnel: add lwtunnel support for xfrm interfaces in collect_md mode") Signed-off-by: Eyal Birger Tested-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Steffen Klassert --- net/core/lwtunnel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 6fac2f0ef074..711cd3b4347a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -48,9 +48,11 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; + case LWTUNNEL_ENCAP_XFRM: + /* module autoload not supported for encap type */ + return NULL; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: - case LWTUNNEL_ENCAP_XFRM: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ -- cgit From 4b549ccce941798703f159b227aa28c716aa78fa Mon Sep 17 00:00:00 2001 From: Christian Langrock Date: Mon, 17 Oct 2022 08:34:47 +0200 Subject: xfrm: replay: Fix ESN wrap around for GSO When using GSO it can happen that the wrong seq_hi is used for the last packets before the wrap around. This can lead to double usage of a sequence number. To avoid this, we should serialize this last GSO packet. Fixes: d7dbefc45cf5 ("xfrm: Add xfrm_replay_overflow functions for offloading") Co-developed-by: Steffen Klassert Signed-off-by: Christian Langrock Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ net/xfrm/xfrm_device.c | 15 ++++++++++++++- net/xfrm/xfrm_replay.c | 2 +- 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 170152772d33..3969fa805679 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -314,6 +314,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ xo->seq.low += skb_shinfo(skb)->gso_segs; } + if (xo->seq.low < seq) + xo->seq.hi++; + esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); ip_hdr(skb)->tot_len = htons(skb->len); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 79d43548279c..242f4295940e 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -346,6 +346,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features xo->seq.low += skb_shinfo(skb)->gso_segs; } + if (xo->seq.low < seq) + xo->seq.hi++; + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); len = skb->len - sizeof(struct ipv6hdr); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5f5aafd418af..21269e8f2db4 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -97,6 +97,18 @@ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) } } +static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + __u32 seq = xo->seq.low; + + seq += skb_shinfo(skb)->gso_segs; + if (unlikely(seq < xo->seq.low)) + return true; + + return false; +} + struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; @@ -134,7 +146,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - if (skb_is_gso(skb) && unlikely(x->xso.dev != dev)) { + if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || + unlikely(xmit_xfrm_check_overflow(skb)))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 9f4d42eb090f..ce56d659c55a 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -714,7 +714,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff oseq += skb_shinfo(skb)->gso_segs; } - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(xo->seq.low < replay_esn->oseq)) { XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; xo->seq.hi = oseq_hi; replay_esn->oseq_hi = oseq_hi; -- cgit From 7f57f8165cb6d2c206e2b9ada53b9e2d6d8af42f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 25 Oct 2022 14:06:48 +0800 Subject: af_key: Fix send_acquire race with pfkey_register The function pfkey_send_acquire may race with pfkey_register (which could even be in a different name space). This may result in a buffer overrun. Allocating the maximum amount of memory that could be used prevents this. Reported-by: syzbot+1e9af9185d8850e2c2fa@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Herbert Xu Reviewed-by: Sabrina Dubroca Reviewed-by: Eric Dumazet Signed-off-by: Steffen Klassert --- net/key/af_key.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index c85df5b958d2..213287814328 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2905,7 +2905,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2923,7 +2923,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { @@ -2934,16 +2934,17 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg) && aalg->available) + if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } -static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -2971,13 +2972,17 @@ static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } -static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; + int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); @@ -3019,8 +3024,11 @@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; + sz += sizeof(*c); } } + + return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) @@ -3150,6 +3158,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; + int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) @@ -3161,16 +3170,16 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) - size += count_ah_combs(t); + alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) - size += count_esp_combs(t); + alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } - skb = alloc_skb(size + 16, GFP_ATOMIC); + skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -3224,10 +3233,13 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ + alg_size = 0; if (x->id.proto == IPPROTO_AH) - dump_ah_combs(skb, t); + alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) - dump_esp_combs(skb, t); + alg_size = dump_esp_combs(skb, t); + + hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { -- cgit From 8207f253a097fe15c93d85ac15ebb73c5e39e1e1 Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Tue, 15 Nov 2022 23:09:41 +0100 Subject: net: neigh: decrement the family specific qlen Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") introduced the length counter qlen in struct neigh_parms. There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and while the family specific qlen is incremented in pneigh_enqueue(), the mentioned commit decrements always the IPv4/ARP specific qlen, regardless of the currently processed family, in pneigh_queue_purge() and neigh_proxy_process(). As a result, with IPv6/ND, the family specific qlen is only incremented (and never decremented) until it exceeds PROXY_QLEN, and then, according to the check in pneigh_enqueue(), neighbor solicitations are not answered anymore. As an example, this is noted when using the subnet-router anycast address to access a Linux router. After a certain amount of time (in the observed case, qlen exceeded PROXY_QLEN after two days), the Linux router stops answering neighbor solicitations for its subnet-router anycast address and effectively becomes unreachable. Another result with IPv6/ND is that the IPv4/ARP specific qlen is decremented more often than incremented. This leads to negative qlen values, as a signed integer has been used for the length counter qlen, and potentially to an integer overflow. Fix this by introducing the helper function neigh_parms_qlen_dec(), which decrements the family specific qlen. Thereby, make use of the existing helper function neigh_get_dev_parms_rcu(), whose definition therefore needs to be placed earlier in neighbour.c. Take the family member from struct neigh_table to determine the currently processed family and appropriately call neigh_parms_qlen_dec() from pneigh_queue_purge() and neigh_proxy_process(). Additionally, use an unsigned integer for the length counter qlen. Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") Signed-off-by: Thomas Zeitlhofer Signed-off-by: David S. Miller --- net/core/neighbour.c | 58 +++++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a77a85e357e0..952a54763358 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -307,7 +307,31 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{ + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) { struct sk_buff_head tmp; unsigned long flags; @@ -321,13 +345,7 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { - struct in_device *in_dev; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } @@ -409,7 +427,8 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, + tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0; @@ -1621,13 +1640,8 @@ static void neigh_proxy_process(struct timer_list *t) if (tdif <= 0) { struct net_device *dev = skb->dev; - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { @@ -1821,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue, NULL); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@ -3539,18 +3553,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, - int family) -{ - switch (family) { - case AF_INET: - return __in_dev_arp_parms_get_rcu(dev); - case AF_INET6: - return __in6_dev_nd_parms_get_rcu(dev); - } - return NULL; -} - static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { -- cgit From 3bcd6c7eaa53b56c3f584da46a1f7652e759d0e5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Nov 2022 14:02:28 +0000 Subject: rxrpc: Fix race between conn bundle lookup and bundle removal [ZDI-CAN-15975] After rxrpc_unbundle_conn() has removed a connection from a bundle, it checks to see if there are any conns with available channels and, if not, removes and attempts to destroy the bundle. Whilst it does check after grabbing client_bundles_lock that there are no connections attached, this races with rxrpc_look_up_bundle() retrieving the bundle, but not attaching a connection for the connection to be attached later. There is therefore a window in which the bundle can get destroyed before we manage to attach a new connection to it. Fix this by adding an "active" counter to struct rxrpc_bundle: (1) rxrpc_connect_call() obtains an active count by prepping/looking up a bundle and ditches it before returning. (2) If, during rxrpc_connect_call(), a connection is added to the bundle, this obtains an active count, which is held until the connection is discarded. (3) rxrpc_deactivate_bundle() is created to drop an active count on a bundle and destroy it when the active count reaches 0. The active count is checked inside client_bundles_lock() to prevent a race with rxrpc_look_up_bundle(). (4) rxrpc_unbundle_conn() then calls rxrpc_deactivate_bundle(). Fixes: 245500d853e9 ("rxrpc: Rewrite the client connection manager") Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-15975 Signed-off-by: David Howells Tested-by: zdi-disclosures@trendmicro.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/conn_client.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1ad0ec5afb50..8499ceb7719c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -399,6 +399,7 @@ enum rxrpc_conn_proto_state { struct rxrpc_bundle { struct rxrpc_conn_parameters params; refcount_t ref; + atomic_t active; /* Number of active users */ unsigned int debug_id; bool try_upgrade; /* True if the bundle is attempting upgrade */ bool alloc_conn; /* True if someone's getting a conn */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3c9eeb5b750c..bdb335cb2d05 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -40,6 +40,8 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The @@ -123,6 +125,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, bundle->params = *cp; rxrpc_get_peer(bundle->params.peer); refcount_set(&bundle->ref, 1); + atomic_set(&bundle->active, 1); spin_lock_init(&bundle->channel_lock); INIT_LIST_HEAD(&bundle->waiting_calls); } @@ -149,7 +152,7 @@ void rxrpc_put_bundle(struct rxrpc_bundle *bundle) dead = __refcount_dec_and_test(&bundle->ref, &r); - _debug("PUT B=%x %d", d, r); + _debug("PUT B=%x %d", d, r - 1); if (dead) rxrpc_free_bundle(bundle); } @@ -338,6 +341,7 @@ found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: rxrpc_get_bundle(bundle); + atomic_inc(&bundle->active); spin_unlock(&local->client_bundles_lock); _leave(" = %u [found]", bundle->debug_id); return bundle; @@ -435,6 +439,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) if (old) trace_rxrpc_client(old, -1, rxrpc_client_replace); candidate->bundle_shift = shift; + atomic_inc(&bundle->active); bundle->conns[i] = candidate; for (j = 0; j < RXRPC_MAXCALLS; j++) set_bit(shift + j, &bundle->avail_chans); @@ -725,6 +730,7 @@ granted_channel: smp_rmb(); out_put_bundle: + rxrpc_deactivate_bundle(bundle); rxrpc_put_bundle(bundle); out: _leave(" = %d", ret); @@ -900,9 +906,8 @@ out: static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; - struct rxrpc_local *local = bundle->params.local; unsigned int bindex; - bool need_drop = false, need_put = false; + bool need_drop = false; int i; _enter("C=%x", conn->debug_id); @@ -921,15 +926,22 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) } spin_unlock(&bundle->channel_lock); - /* If there are no more connections, remove the bundle */ - if (!bundle->avail_chans) { - _debug("maybe unbundle"); - spin_lock(&local->client_bundles_lock); + if (need_drop) { + rxrpc_deactivate_bundle(bundle); + rxrpc_put_connection(conn); + } +} - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) - if (bundle->conns[i]) - break; - if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) { +/* + * Drop the active count on a bundle. + */ +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) +{ + struct rxrpc_local *local = bundle->params.local; + bool need_put = false; + + if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { + if (!bundle->params.exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); need_put = true; @@ -939,10 +951,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (need_put) rxrpc_put_bundle(bundle); } - - if (need_drop) - rxrpc_put_connection(conn); - _leave(""); } /* -- cgit From 0ad6bded175e829c2ca261529c9dce39a32a042d Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 16 Nov 2022 21:02:49 +0800 Subject: nfc/nci: fix race with opening and closing Previously we leverage NCI_UNREG and the lock inside nci_close_device to prevent the race condition between opening a device and closing a device. However, it still has problem because a failed opening command will erase the NCI_UNREG flag and allow another opening command to bypass the status checking. This fix corrects that by making sure the NCI_UNREG is held. Reported-by: syzbot+43475bf3cfbd6e41f5b7@syzkaller.appspotmail.com Fixes: 48b71a9e66c2 ("NFC: add NCI_UNREG flag to eliminate the race") Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- net/nfc/nci/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6a193cce2a75..4ffdf2f45c44 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -542,7 +542,7 @@ static int nci_open_device(struct nci_dev *ndev) skb_queue_purge(&ndev->tx_q); ndev->ops->close(ndev); - ndev->flags = 0; + ndev->flags &= BIT(NCI_UNREG); } done: -- cgit From 11c10956515b8ec44cf4f2a7b9d8bf8b9dc05ec4 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 10 Nov 2022 20:26:06 +0800 Subject: 9p/fd: fix issue of list_del corruption in p9_fd_cancel() Syz reported the following issue: kernel BUG at lib/list_debug.c:53! invalid opcode: 0000 [#1] PREEMPT SMP KASAN RIP: 0010:__list_del_entry_valid.cold+0x5c/0x72 Call Trace: p9_fd_cancel+0xb1/0x270 p9_client_rpc+0x8ea/0xba0 p9_client_create+0x9c0/0xed0 v9fs_session_init+0x1e0/0x1620 v9fs_mount+0xba/0xb80 legacy_get_tree+0x103/0x200 vfs_get_tree+0x89/0x2d0 path_mount+0x4c0/0x1ac0 __x64_sys_mount+0x33b/0x430 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The process is as follows: Thread A: Thread B: p9_poll_workfn() p9_client_create() ... ... p9_conn_cancel() p9_fd_cancel() list_del() ... ... list_del() //list_del corruption There is no lock protection when deleting list in p9_conn_cancel(). After deleting list in Thread A, thread B will delete the same list again. It will cause issue of list_del corruption. Setting req->status to REQ_STATUS_ERROR under lock prevents other cleanup paths from trying to manipulate req_list. The other thread can safely check req->status because it still holds a reference to req at this point. Link: https://lkml.kernel.org/r/20221110122606.383352-1-shaozhengchao@huawei.com Fixes: 52f1c45dde91 ("9p: trans_fd/p9_conn_cancel: drop client lock earlier") Reported-by: syzbot+9b69b8d10ab4a7d88056@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao [Dominique: add description of the fix in commit message] Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 56a186768750..bd28e63d7666 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -202,9 +202,11 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { list_move(&req->req_list, &cancel_list); + req->status = REQ_STATUS_ERROR; } list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { list_move(&req->req_list, &cancel_list); + req->status = REQ_STATUS_ERROR; } spin_unlock(&m->req_lock); -- cgit From 578b565b240afdfe0596d183f473f333eb9d3008 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Thu, 17 Nov 2022 17:11:57 +0800 Subject: 9p/fd: Fix write overflow in p9_read_work This error was reported while fuzzing: BUG: KASAN: slab-out-of-bounds in _copy_to_iter+0xd35/0x1190 Write of size 4043 at addr ffff888008724eb1 by task kworker/1:1/24 CPU: 1 PID: 24 Comm: kworker/1:1 Not tainted 6.1.0-rc5-00002-g1adf73218daa-dirty #223 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Workqueue: events p9_read_work Call Trace: dump_stack_lvl+0x4c/0x64 print_report+0x178/0x4b0 kasan_report+0xae/0x130 kasan_check_range+0x179/0x1e0 memcpy+0x38/0x60 _copy_to_iter+0xd35/0x1190 copy_page_to_iter+0x1d5/0xb00 pipe_read+0x3a1/0xd90 __kernel_read+0x2a5/0x760 kernel_read+0x47/0x60 p9_read_work+0x463/0x780 process_one_work+0x91d/0x1300 worker_thread+0x8c/0x1210 kthread+0x280/0x330 ret_from_fork+0x22/0x30 Allocated by task 457: kasan_save_stack+0x1c/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x7e/0x90 __kmalloc+0x59/0x140 p9_fcall_init.isra.11+0x5d/0x1c0 p9_tag_alloc+0x251/0x550 p9_client_prepare_req+0x162/0x350 p9_client_rpc+0x18d/0xa90 p9_client_create+0x670/0x14e0 v9fs_session_init+0x1fd/0x14f0 v9fs_mount+0xd7/0xaf0 legacy_get_tree+0xf3/0x1f0 vfs_get_tree+0x86/0x2c0 path_mount+0x885/0x1940 do_mount+0xec/0x100 __x64_sys_mount+0x1a0/0x1e0 do_syscall_64+0x3a/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd This BUG pops up when trying to reproduce https://syzkaller.appspot.com/bug?id=6c7cd46c7bdd0e86f95d26ec3153208ad186f9fa The callstack is different but the issue is valid and re-producable with the same re-producer in the link. The root cause of this issue is that we check the size of the message received against the msize of the client in p9_read_work. However, it turns out that capacity is no longer consistent with msize. Thus, the message size should be checked against sdata capacity. As the msize is non-consistant with the capacity of the tag and as we are now checking message size against capacity directly, there is no point checking message size against msize. So remove it. Link: https://lkml.kernel.org/r/20221117091159.31533-2-guozihua@huawei.com Link: https://lkml.kernel.org/r/20221117091159.31533-3-guozihua@huawei.com Reported-by: syzbot+0f89bd13eaceccc0e126@syzkaller.appspotmail.com Fixes: 60ece0833b6c ("net/9p: allocate appropriate reduced message buffers") Signed-off-by: GUO Zihua Reviewed-by: Christian Schoenebeck [Dominique: squash patches 1 & 2 and fix size including header part] Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bd28e63d7666..27c5c938ccd1 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -324,14 +324,6 @@ static void p9_read_work(struct work_struct *work) goto error; } - if (m->rc.size >= m->client->msize) { - p9_debug(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", - m->rc.size); - err = -EIO; - goto error; - } - p9_debug(P9_DEBUG_TRANS, "mux %p pkt: size: %d bytes tag: %d\n", m, m->rc.size, m->rc.tag); @@ -344,6 +336,14 @@ static void p9_read_work(struct work_struct *work) goto error; } + if (m->rc.size > m->rreq->rc.capacity) { + p9_debug(P9_DEBUG_ERROR, + "requested packet size too big: %d for tag %d with capacity %zd\n", + m->rc.size, m->rc.tag, m->rreq->rc.capacity); + err = -EIO; + goto error; + } + if (!m->rreq->rc.sdata) { p9_debug(P9_DEBUG_ERROR, "No recv fcall for tag %d (req %p), disconnecting!\n", -- cgit From 6854fadbeee10891ed74246bdc05031906b6c8cf Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Thu, 17 Nov 2022 17:11:59 +0800 Subject: 9p/fd: Use P9_HDRSZ for header size Cleanup hardcoded header sizes to use P9_HDRSZ instead of '7' Link: https://lkml.kernel.org/r/20221117091159.31533-4-guozihua@huawei.com Signed-off-by: GUO Zihua Reviewed-by: Christian Schoenebeck [Dominique: commit message adjusted to make sense after offset size adjustment got removed] Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 27c5c938ccd1..eeea0a6a75b6 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -120,7 +120,7 @@ struct p9_conn { struct list_head unsent_req_list; struct p9_req_t *rreq; struct p9_req_t *wreq; - char tmp_buf[7]; + char tmp_buf[P9_HDRSZ]; struct p9_fcall rc; int wpos; int wsize; @@ -293,7 +293,7 @@ static void p9_read_work(struct work_struct *work) if (!m->rc.sdata) { m->rc.sdata = m->tmp_buf; m->rc.offset = 0; - m->rc.capacity = 7; /* start by reading header */ + m->rc.capacity = P9_HDRSZ; /* start by reading header */ } clear_bit(Rpending, &m->wsched); @@ -316,7 +316,7 @@ static void p9_read_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "got new header\n"); /* Header size */ - m->rc.size = 7; + m->rc.size = P9_HDRSZ; err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0); if (err) { p9_debug(P9_DEBUG_ERROR, -- cgit From 52d1aa8b8249ff477aaa38b6f74a8ced780d079c Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 9 Nov 2022 12:39:07 -0700 Subject: netfilter: conntrack: Fix data-races around ct mark nf_conn:mark can be read from and written to in parallel. Use READ_ONCE()/WRITE_ONCE() for reads and writes to prevent unwanted compiler optimizations. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daniel Xu Signed-off-by: Pablo Neira Ayuso --- net/core/flow_dissector.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 ++-- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 24 ++++++++++++++---------- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nft_ct.c | 6 +++--- net/netfilter/xt_connmark.c | 18 ++++++++++-------- net/openvswitch/conntrack.c | 8 ++++---- net/sched/act_connmark.c | 4 ++-- net/sched/act_ct.c | 8 ++++---- net/sched/act_ctinfo.c | 6 +++--- 11 files changed, 45 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25cd35f5922e..007730412947 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -296,7 +296,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - key->ct_mark = ct->mark; + key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8e176c77d1c..b3cc416ed292 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -435,7 +435,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + WRITE_ONCE(ct->mark, hash); break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: @@ -452,7 +452,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) #ifdef DEBUG nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - pr_debug("hash=%u ct_hash=%u ", hash, ct->mark); + pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); if (!clusterip_responsible(cipinfo->config, hash)) { pr_debug("not responsible\n"); return NF_DROP; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f97bda06d2a9..2692139ce417 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1781,7 +1781,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } #ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = READ_ONCE(exp->master->mark); #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK ct->secmark = exp->master->secmark; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7562b215b932..d71150a40fb0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -328,9 +328,9 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_mark(struct sk_buff *skb, u32 mark) { - if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) + if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; @@ -543,7 +543,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb, static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || - ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_mark(skb, READ_ONCE(ct->mark)) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || @@ -722,6 +722,7 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) struct sk_buff *skb; unsigned int type; unsigned int flags = 0, group; + u32 mark; int err; if (events & (1 << IPCT_DESTROY)) { @@ -826,8 +827,9 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((events & (1 << IPCT_MARK) || ct->mark) - && ctnetlink_dump_mark(skb, ct) < 0) + mark = READ_ONCE(ct->mark); + if ((events & (1 << IPCT_MARK) || mark) && + ctnetlink_dump_mark(skb, mark) < 0) goto nla_put_failure; #endif nlmsg_end(skb, nlh); @@ -1154,7 +1156,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif status = (u32)READ_ONCE(ct->status); @@ -2002,9 +2004,9 @@ static void ctnetlink_change_mark(struct nf_conn *ct, mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); mark = ntohl(nla_get_be32(cda[CTA_MARK])); - newmark = (ct->mark & mask) ^ mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (READ_ONCE(ct->mark) & mask) ^ mark; + if (newmark != READ_ONCE(ct->mark)) + WRITE_ONCE(ct->mark, newmark); } #endif @@ -2669,6 +2671,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; + u32 mark; zone = nf_ct_zone(ct); @@ -2730,7 +2733,8 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK - if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) + mark = READ_ONCE(ct->mark); + if (mark && ctnetlink_dump_mark(skb, mark) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 4ffe84c5a82c..bca839ab1ae8 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -366,7 +366,7 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) - seq_printf(s, "mark=%u ", ct->mark); + seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a3f01f209a53..641dc21f92b4 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -98,7 +98,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - *dest = ct->mark; + *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK @@ -297,8 +297,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr, switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - if (ct->mark != value) { - ct->mark = value; + if (READ_ONCE(ct->mark) != value) { + WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index e5ebc0810675..ad3c033db64e 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -30,6 +30,7 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) u_int32_t new_targetmark; struct nf_conn *ct; u_int32_t newmark; + u_int32_t oldmark; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) @@ -37,14 +38,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) switch (info->mode) { case XT_CONNMARK_SET: - newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + oldmark = READ_ONCE(ct->mark); + newmark = (oldmark & ~info->ctmask) ^ info->ctmark; if (info->shift_dir == D_SHIFT_RIGHT) newmark >>= info->shift_bits; else newmark <<= info->shift_bits; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; @@ -55,15 +57,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info) else new_targetmark <<= info->shift_bits; - newmark = (ct->mark & ~info->ctmask) ^ + newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^ new_targetmark; - if (ct->mark != newmark) { - ct->mark = newmark; + if (READ_ONCE(ct->mark) != newmark) { + WRITE_ONCE(ct->mark, newmark); nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: - new_targetmark = (ct->mark & info->ctmask); + new_targetmark = (READ_ONCE(ct->mark) & info->ctmask); if (info->shift_dir == D_SHIFT_RIGHT) new_targetmark >>= info->shift_bits; else @@ -126,7 +128,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL) return false; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert; } static int connmark_mt_check(const struct xt_mtchk_param *par) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c7b10234cf7c..c8eaf4234b2e 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -152,7 +152,7 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) static u32 ovs_ct_get_mark(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - return ct ? ct->mark : 0; + return ct ? READ_ONCE(ct->mark) : 0; #else return 0; #endif @@ -340,9 +340,9 @@ static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) u32 new_mark; - new_mark = ct_mark | (ct->mark & ~(mask)); - if (ct->mark != new_mark) { - ct->mark = new_mark; + new_mark = ct_mark | (READ_ONCE(ct->mark) & ~(mask)); + if (READ_ONCE(ct->mark) != new_mark) { + WRITE_ONCE(ct->mark, new_mark); if (nf_ct_is_confirmed(ct)) nf_conntrack_event_cache(IPCT_MARK, ct); key->ct.mark = new_mark; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 66b143bb04ac..d41002e4613f 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -61,7 +61,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, c = nf_ct_get(skb, &ctinfo); if (c) { - skb->mark = c->mark; + skb->mark = READ_ONCE(c->mark); /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; goto out; @@ -81,7 +81,7 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, c = nf_ct_tuplehash_to_ctrack(thash); /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; - skb->mark = c->mark; + skb->mark = READ_ONCE(c->mark); nf_ct_put(c); out: diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index b38d91d6b249..4c7f7861ea96 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -178,7 +178,7 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, entry = tcf_ct_flow_table_flow_action_get_next(action); entry->id = FLOW_ACTION_CT_METADATA; #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - entry->ct_metadata.mark = ct->mark; + entry->ct_metadata.mark = READ_ONCE(ct->mark); #endif ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED : IP_CT_ESTABLISHED_REPLY; @@ -936,9 +936,9 @@ static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) if (!mask) return; - new_mark = mark | (ct->mark & ~(mask)); - if (ct->mark != new_mark) { - ct->mark = new_mark; + new_mark = mark | (READ_ONCE(ct->mark) & ~(mask)); + if (READ_ONCE(ct->mark) != new_mark) { + WRITE_ONCE(ct->mark, new_mark); if (nf_ct_is_confirmed(ct)) nf_conntrack_event_cache(IPCT_MARK, ct); } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index d4102f0a9abd..eaa02f098d1c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -32,7 +32,7 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, { u8 dscp, newdscp; - newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) & + newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) & ~INET_ECN_MASK; switch (proto) { @@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct sk_buff *skb) { ca->stats_cpmark_set++; - skb->mark = ct->mark & cp->cpmarkmask; + skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, @@ -130,7 +130,7 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, } if (cp->mode & CTINFO_MODE_DSCP) - if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask)) + if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask)) tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); if (cp->mode & CTINFO_MODE_CPMARK) -- cgit From 33c7aba0b4ffd6d7cdab862a034eb582a5120a38 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2022 11:31:54 +0100 Subject: netfilter: nf_tables: do not set up extensions for end interval Elements with an end interval flag set on do not store extensions. The global set definition is currently setting on the timeout and stateful expression for end interval elements. This leads to skipping end interval elements from the set->ops->walk() path as the expired check bogusly reports true. Moreover, do not set up stateful expressions for elements with end interval flag set on since this is never used. Fixes: 65038428b2c6 ("netfilter: nf_tables: allow to specify stateful expression in set definition") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e7152d599d73..7a09421f19e1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5958,7 +5958,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, &timeout); if (err) return err; - } else if (set->flags & NFT_SET_TIMEOUT) { + } else if (set->flags & NFT_SET_TIMEOUT && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { timeout = set->timeout; } @@ -6024,7 +6025,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EOPNOTSUPP; goto err_set_elem_expr; } - } else if (set->num_exprs > 0) { + } else if (set->num_exprs > 0 && + !(flags & NFT_SET_ELEM_INTERVAL_END)) { err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; -- cgit From 764f8485890d4988a2083f5dea90cfe0116b25f6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Nov 2022 20:21:52 -0800 Subject: ipv4/fib: Replace zero-length array with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated[1] and are being replaced with flexible array members in support of the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy(), correctly instrument array indexing with UBSAN_BOUNDS, and to globally enable -fstrict-flex-arrays=3. Replace zero-length array with flexible-array member in struct key_vector. This results in no differences in binary output. [1] https://github.com/KSPP/linux/issues/78 Cc: Jakub Kicinski Cc: "David S. Miller" Cc: Hideaki YOSHIFUJI Cc: David Ahern Cc: Eric Dumazet Cc: Paolo Abeni Cc: "Gustavo A. R. Silva" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 452ff177e4da..c88bf856c443 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -126,7 +126,7 @@ struct key_vector { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ - struct key_vector __rcu *tnode[0]; + DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; -- cgit From c7aa1a76d4a0a3c401025b60c401412bbb60f8c6 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Wed, 28 Sep 2022 14:26:50 -0400 Subject: netfilter: ipset: regression in ip_set_hash_ip.c This patch introduced a regression: commit 48596a8ddc46 ("netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses") The variable e.ip is passed to adtfn() function which finally adds the ip address to the set. The patch above refactored the for loop and moved e.ip = htonl(ip) to the end of the for loop. What this means is that if the value of "ip" changes between the first assignement of e.ip and the forloop, then e.ip is pointing to a different ip address than "ip". Test case: $ ipset create jdtest_tmp hash:ip family inet hashsize 2048 maxelem 100000 $ ipset add jdtest_tmp 10.0.1.1/31 ipset v6.21.1: Element cannot be added to the set: it's already added The value of ip gets updated inside the "else if (tb[IPSET_ATTR_CIDR])" block but e.ip is still pointing to the old value. Fixes: 48596a8ddc46 ("netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses") Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_ip.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd30c03d5a23..75d556d71652 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -151,18 +151,16 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) return -ERANGE; - if (retried) { + if (retried) ip = ntohl(h->next.ip); - e.ip = htonl(ip); - } for (; ip <= ip_to;) { + e.ip = htonl(ip); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ip += hosts; - e.ip = htonl(ip); - if (e.ip == 0) + if (ip == 0) return 0; ret = 0; -- cgit From 0e5d56c64afcd6fd2d132ea972605b66f8a7d3c4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:45:00 -0500 Subject: tipc: set con sock in tipc_conn_alloc A crash was reported by Wei Chen: BUG: kernel NULL pointer dereference, address: 0000000000000018 RIP: 0010:tipc_conn_close+0x12/0x100 Call Trace: tipc_topsrv_exit_net+0x139/0x320 ops_exit_list.isra.9+0x49/0x80 cleanup_net+0x31a/0x540 process_one_work+0x3fa/0x9f0 worker_thread+0x42/0x5c0 It was caused by !con->sock in tipc_conn_close(). In tipc_topsrv_accept(), con is allocated in conn_idr then its sock is set: con = tipc_conn_alloc(); ... <----[1] con->sock = newsock; If tipc_conn_close() is called in anytime of [1], the null-pointer-def is triggered by con->sock->sk due to con->sock is not yet set. This patch fixes it by moving the con->sock setting to tipc_conn_alloc() under s->idr_lock. So that con->sock can never be NULL when getting the con from s->conn_idr. It will be also safer to move con->server and flag CF_CONNECTED setting under s->idr_lock, as they should all be set before tipc_conn_alloc() is called. Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure") Reported-by: Wei Chen Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index d92ec92f0b71..b0f9aa521670 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -176,7 +176,7 @@ static void tipc_conn_close(struct tipc_conn *con) conn_put(con); } -static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) +static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock) { struct tipc_conn *con; int ret; @@ -202,10 +202,11 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) } con->conid = ret; s->idr_in_use++; - spin_unlock_bh(&s->idr_lock); set_bit(CF_CONNECTED, &con->flags); con->server = s; + con->sock = sock; + spin_unlock_bh(&s->idr_lock); return con; } @@ -467,7 +468,7 @@ static void tipc_topsrv_accept(struct work_struct *work) ret = kernel_accept(lsock, &newsock, O_NONBLOCK); if (ret < 0) return; - con = tipc_conn_alloc(srv); + con = tipc_conn_alloc(srv, newsock); if (IS_ERR(con)) { ret = PTR_ERR(con); sock_release(newsock); @@ -479,7 +480,6 @@ static void tipc_topsrv_accept(struct work_struct *work) newsk->sk_data_ready = tipc_conn_data_ready; newsk->sk_write_space = tipc_conn_write_space; newsk->sk_user_data = con; - con->sock = newsock; write_unlock_bh(&newsk->sk_callback_lock); /* Wake up receive process in case of 'SYN+' message */ @@ -577,12 +577,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, sub.filter = filter; *(u64 *)&sub.usr_handle = (u64)port; - con = tipc_conn_alloc(tipc_topsrv(net)); + con = tipc_conn_alloc(tipc_topsrv(net), NULL); if (IS_ERR(con)) return false; *conid = con->conid; - con->sock = NULL; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); if (rc >= 0) return true; -- cgit From a7b42969d63f47320853a802efd879fbdc4e010e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:45:01 -0500 Subject: tipc: add an extra conn_get in tipc_conn_alloc One extra conn_get() is needed in tipc_conn_alloc(), as after tipc_conn_alloc() is called, tipc_conn_close() may free this con before deferencing it in tipc_topsrv_accept(): tipc_conn_alloc(); newsk = newsock->sk; <---- tipc_conn_close(); write_lock_bh(&sk->sk_callback_lock); newsk->sk_data_ready = tipc_conn_data_ready; Then an uaf issue can be triggered: BUG: KASAN: use-after-free in tipc_topsrv_accept+0x1e7/0x370 [tipc] Call Trace: dump_stack_lvl+0x33/0x46 print_report+0x178/0x4b0 kasan_report+0x8c/0x100 kasan_check_range+0x179/0x1e0 tipc_topsrv_accept+0x1e7/0x370 [tipc] process_one_work+0x6a3/0x1030 worker_thread+0x8a/0xdf0 This patch fixes it by holding it in tipc_conn_alloc(), then after all accessing in tipc_topsrv_accept() releasing it. Note when does this in tipc_topsrv_kern_subscr(), as tipc_conn_rcv_sub() returns 0 or -1 only, we don't need to check for "> 0". Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure") Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index b0f9aa521670..e3b427a70398 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -206,6 +206,7 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *s set_bit(CF_CONNECTED, &con->flags); con->server = s; con->sock = sock; + conn_get(con); spin_unlock_bh(&s->idr_lock); return con; @@ -484,6 +485,7 @@ static void tipc_topsrv_accept(struct work_struct *work) /* Wake up receive process in case of 'SYN+' message */ newsk->sk_data_ready(newsk); + conn_put(con); } } @@ -583,10 +585,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, *conid = con->conid; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); - if (rc >= 0) - return true; + if (rc) + conn_put(con); + conn_put(con); - return false; + return !rc; } void tipc_topsrv_kern_unsubscr(struct net *net, int conid) -- cgit From cd0f6421162201e4b22ce757a1966729323185eb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 19 Nov 2022 15:28:32 +0800 Subject: tipc: check skb_linearize() return value in tipc_disc_rcv() If skb_linearize() fails in tipc_disc_rcv(), we need to free the skb instead of handle it. Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: YueHaibing Acked-by: Jon Maloy Link: https://lore.kernel.org/r/20221119072832.7896-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/tipc/discover.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index e8630707901e..e8dcdf267c0c 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -211,7 +211,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, u32 self; int err; - skb_linearize(skb); + if (skb_linearize(skb)) { + kfree_skb(skb); + return; + } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) -- cgit From b97df039a68b2f3e848e238df5d5d06343ea497b Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Wed, 2 Nov 2022 11:18:48 +0100 Subject: xfrm: Fix oops in __xfrm_state_delete() Kernel 5.14 added a new "byseq" index to speed up xfrm_state lookups by sequence number in commit fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") While the patch was thorough, the function pfkey_send_new_mapping() in net/af_key.c also modifies x->km.seq and never added the current xfrm_state to the "byseq" index. This leads to the following kernel Ooops: BUG: kernel NULL pointer dereference, address: 0000000000000000 .. RIP: 0010:__xfrm_state_delete+0xc9/0x1c0 .. Call Trace: xfrm_state_delete+0x1e/0x40 xfrm_del_sa+0xb0/0x110 [xfrm_user] xfrm_user_rcv_msg+0x12d/0x270 [xfrm_user] ? remove_entity_load_avg+0x8a/0xa0 ? copy_to_user_state_extra+0x580/0x580 [xfrm_user] netlink_rcv_skb+0x51/0x100 xfrm_netlink_rcv+0x30/0x50 [xfrm_user] netlink_unicast+0x1a6/0x270 netlink_sendmsg+0x22a/0x480 __sys_sendto+0x1a6/0x1c0 ? __audit_syscall_entry+0xd8/0x130 ? __audit_syscall_exit+0x249/0x2b0 __x64_sys_sendto+0x23/0x30 do_syscall_64+0x3a/0x90 entry_SYSCALL_64_after_hwframe+0x61/0xcb Exact location of the crash in __xfrm_state_delete(): if (x->km.seq) hlist_del_rcu(&x->byseq); The hlist_node "byseq" was never populated. The bug only triggers if a new NAT traversal mapping (changed IP or port) is detected in esp_input_done2() / esp6_input_done2(), which in turn indirectly calls pfkey_send_new_mapping() *if* the kernel is compiled with CONFIG_NET_KEY and "af_key" is active. The PF_KEYv2 message SADB_X_NAT_T_NEW_MAPPING is not part of RFC 2367. Various implementations have been examined how they handle the "sadb_msg_seq" header field: - racoon (Android): does not process SADB_X_NAT_T_NEW_MAPPING - strongswan: does not care about sadb_msg_seq - openswan: does not care about sadb_msg_seq There is no standard how PF_KEYv2 sadb_msg_seq should be populated for SADB_X_NAT_T_NEW_MAPPING and it's not used in popular implementations either. Herbert Xu suggested we should just use the current km.seq value as is. This fixes the root cause of the oops since we no longer modify km.seq itself. The update of "km.seq" looks like a copy'n'paste error from pfkey_send_acquire(). SADB_ACQUIRE must indeed assign a unique km.seq number according to RFC 2367. It has been verified that code paths involving pfkey_send_acquire() don't cause the same Oops. PF_KEYv2 SADB_X_NAT_T_NEW_MAPPING support was originally added here: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git commit cbc3488685b20e7b2a98ad387a1a816aada569d8 Author: Derek Atkins AuthorDate: Wed Apr 2 13:21:02 2003 -0800 [IPSEC]: Implement UDP Encapsulation framework. In particular, implement ESPinUDP encapsulation for IPsec Nat Traversal. A note on triggering the bug: I was not able to trigger it using VMs. There is one VPN using a high latency link on our production VPN server that triggered it like once a day though. Link: https://github.com/strongswan/strongswan/issues/992 Link: https://lore.kernel.org/netdev/00959f33ee52c4b3b0084d42c430418e502db554.1652340703.git.antony.antony@secunet.com/T/ Link: https://lore.kernel.org/netdev/20221027142455.3975224-1-chenzhihao@meizu.com/T/ Fixes: fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") Reported-by: Roth Mark Reported-by: Zhihao Chen Tested-by: Roth Mark Signed-off-by: Thomas Jarosch Acked-by: Antony Antony Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 213287814328..95edcbedf6ef 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3394,7 +3394,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; - hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ -- cgit From 40781bfb836eda57d19c0baa37c7e72590e05fdc Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 3 Nov 2022 17:07:13 +0800 Subject: xfrm: Fix ignored return value in xfrm6_init() When IPv6 module initializing in xfrm6_init(), register_pernet_subsys() is possible to fail but its return value is ignored. If IPv6 initialization fails later and xfrm6_fini() is called, removing uninitialized list in xfrm6_net_ops will cause null-ptr-deref: KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 PID: 330 Comm: insmod RIP: 0010:unregister_pernet_operations+0xc9/0x450 Call Trace: unregister_pernet_subsys+0x31/0x3e xfrm6_fini+0x16/0x30 [ipv6] ip6_route_init+0xcd/0x128 [ipv6] inet6_init+0x29c/0x602 [ipv6] ... Fix it by catching the error return value of register_pernet_subsys(). Fixes: 8d068875caca ("xfrm: make gc_thresh configurable in all namespaces") Signed-off-by: Chen Zhongjin Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 4a4b0e49ec92..ea435eba3053 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -287,9 +287,13 @@ int __init xfrm6_init(void) if (ret) goto out_state; - register_pernet_subsys(&xfrm6_net_ops); + ret = register_pernet_subsys(&xfrm6_net_ops); + if (ret) + goto out_protocol; out: return ret; +out_protocol: + xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: -- cgit From 8427fd100c7b7793650e212a81e42f1cf124613d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Nov 2022 16:33:03 -0500 Subject: net: sched: allow act_ct to be built without NF_NAT In commit f11fe1dae1c4 ("net/sched: Make NET_ACT_CT depends on NF_NAT"), it fixed the build failure when NF_NAT is m and NET_ACT_CT is y by adding depends on NF_NAT for NET_ACT_CT. However, it would also cause NET_ACT_CT cannot be built without NF_NAT, which is not expected. This patch fixes it by changing to use "(!NF_NAT || NF_NAT)" as the depend. Fixes: f11fe1dae1c4 ("net/sched: Make NET_ACT_CT depends on NF_NAT") Signed-off-by: Xin Long Link: https://lore.kernel.org/r/b6386f28d1ba34721795fb776a91cbdabb203447.1668807183.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sched/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 1e8ab4749c6c..4662a6ce8a7e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -976,7 +976,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE + depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE help Say Y here to allow sending the packets to conntrack module. -- cgit From 53270fb0fd77fe786d8c07a0793981d797836b93 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 18 Nov 2022 16:24:19 +0800 Subject: NFC: nci: fix memory leak in nci_rx_data_packet() Syzbot reported a memory leak about skb: unreferenced object 0xffff88810e144e00 (size 240): comm "syz-executor284", pid 3701, jiffies 4294952403 (age 12.620s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __alloc_skb+0x1f9/0x270 net/core/skbuff.c:497 [] alloc_skb include/linux/skbuff.h:1267 [inline] [] virtual_ncidev_write+0x24/0xe0 drivers/nfc/virtual_ncidev.c:116 [] do_loop_readv_writev fs/read_write.c:759 [inline] [] do_loop_readv_writev fs/read_write.c:743 [inline] [] do_iter_write+0x253/0x300 fs/read_write.c:863 [] vfs_writev+0xdd/0x240 fs/read_write.c:934 [] do_writev+0xa6/0x1c0 fs/read_write.c:977 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd In nci_rx_data_packet(), if we don't get a valid conn_info, we will return directly but forget to release the skb. Reported-by: syzbot+cdb9a427d1bc08815104@syzkaller.appspotmail.com Fixes: 4aeee6871e8c ("NFC: nci: Add dynamic logical connections support") Signed-off-by: Liu Shixin Link: https://lore.kernel.org/r/20221118082419.239475-1-liushixin2@huawei.com Signed-off-by: Paolo Abeni --- net/nfc/nci/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index aa5e712adf07..3d36ea5701f0 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -279,8 +279,10 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) nci_plen(skb->data)); conn_info = nci_get_conn_info_by_conn_id(ndev, nci_conn_id(skb->data)); - if (!conn_info) + if (!conn_info) { + kfree_skb(skb); return; + } /* strip the nci data header */ skb_pull(skb, NCI_DATA_HDR_SIZE); -- cgit From 6a66ce44a51bdfc47721f0c591137df2d4b21247 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 22 Nov 2022 20:18:58 +0100 Subject: netfilter: ipset: restore allowing 64 clashing elements in hash:net,iface The commit 510841da1fcc ("netfilter: ipset: enforce documented limit to prevent allocating huge memory") was too strict and prevented to add up to 64 clashing elements to a hash:net,iface type of set. This patch fixes the issue and now the type behaves as documented. Fixes: 510841da1fcc ("netfilter: ipset: enforce documented limit to prevent allocating huge memory") Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 3adc291d9ce1..7499192af586 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -916,7 +916,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, #ifdef IP_SET_HASH_WITH_MULTI if (h->bucketsize >= AHASH_MAX_TUNED) goto set_full; - else if (h->bucketsize < multi) + else if (h->bucketsize <= multi) h->bucketsize += AHASH_INIT_SIZE; #endif if (n->size >= AHASH_MAX(h)) { -- cgit From bcd9e3c1656d0f7dd9743598c65c3ae24efb38d0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 21 Nov 2022 19:26:15 +0100 Subject: netfilter: flowtable_offload: add missing locking nf_flow_table_block_setup and the driver TC_SETUP_FT call can modify the flow block cb list while they are being traversed elsewhere, causing a crash. Add a write lock around the calls to protect readers Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Reported-by: Chad Monroe Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b04645ced89b..00b522890d77 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1098,6 +1098,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, struct flow_block_cb *block_cb, *next; int err = 0; + down_write(&flowtable->flow_block_lock); switch (cmd) { case FLOW_BLOCK_BIND: list_splice(&bo->cb_list, &flowtable->flow_block.cb_list); @@ -1112,6 +1113,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, WARN_ON_ONCE(1); err = -EOPNOTSUPP; } + up_write(&flowtable->flow_block_lock); return err; } @@ -1168,7 +1170,9 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); + down_write(&flowtable->flow_block_lock); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + up_write(&flowtable->flow_block_lock); if (err < 0) return err; -- cgit From 77934dc6db0d2b111a8f2759e9ad2fb67f5cffa5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:11 -0800 Subject: dccp/tcp: Reset saddr on failure after inet6?_hash_connect(). When connect() is called on a socket bound to the wildcard address, we change the socket's saddr to a local address. If the socket fails to connect() to the destination, we have to reset the saddr. However, when an error occurs after inet_hash6?_connect() in (dccp|tcp)_v[46]_conect(), we forget to reset saddr and leave the socket bound to the address. From the user's point of view, whether saddr is reset or not varies with errno. Let's fix this inconsistent behaviour. Note that after this patch, the repro [0] will trigger the WARN_ON() in inet_csk_get_port() again, but this patch is not buggy and rather fixes a bug papering over the bhash2's bug for which we need another fix. For the record, the repro causes -EADDRNOTAVAIL in inet_hash6_connect() by this sequence: s1 = socket() s1.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) s1.bind(('127.0.0.1', 10000)) s1.sendto(b'hello', MSG_FASTOPEN, (('127.0.0.1', 10000))) # or s1.connect(('127.0.0.1', 10000)) s2 = socket() s2.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1) s2.bind(('0.0.0.0', 10000)) s2.connect(('127.0.0.1', 10000)) # -EADDRNOTAVAIL s2.listen(32) # WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); [0]: https://syzkaller.appspot.com/bug?extid=015d756bbd1f8b5c8f09 Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/dccp/ipv4.c | 2 ++ net/dccp/ipv6.c | 2 ++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 4 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 713b7b8dad7e..40640c26680e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -157,6 +157,8 @@ failure: * This unhashes the socket and releases the local port, if necessary. */ dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e57b43006074..626166cb6d7e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -985,6 +985,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: dccp_set_state(sk, DCCP_CLOSED); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 87d440f47a70..6a3a732b584d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -343,6 +343,8 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2a3f9296df1e..81b396e5cf79 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -359,6 +359,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; -- cgit From 8acdad37cd13ce777b0811b2d332314037fcefa8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:12 -0800 Subject: dccp/tcp: Remove NULL check for prev_saddr in inet_bhash2_update_saddr(). When we call inet_bhash2_update_saddr(), prev_saddr is always non-NULL. Let's remove the unnecessary test. Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 033bf3c2538f..d745f962745e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -877,13 +877,10 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - if (prev_saddr) { - spin_lock_bh(&prev_saddr->lock); - __sk_del_bind2_node(sk); - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); - } + spin_lock_bh(&prev_saddr->lock); + __sk_del_bind2_node(sk); + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); + spin_unlock_bh(&prev_saddr->lock); spin_lock_bh(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); -- cgit From 8c5dae4c1a49489499e6708c7dd284370ca36287 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:13 -0800 Subject: dccp/tcp: Update saddr under bhash's lock. When we call connect() for a socket bound to a wildcard address, we update saddr locklessly. However, it could result in a data race; another thread iterating over bhash might see a corrupted address. Let's update saddr under the bhash bucket's lock. Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/dccp/ipv4.c | 22 ++++------------------ net/dccp/ipv6.c | 23 ++++------------------- net/ipv4/af_inet.c | 11 +---------- net/ipv4/inet_hashtables.c | 45 +++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_ipv4.c | 20 ++++---------------- net/ipv6/tcp_ipv6.c | 19 +++---------------- 6 files changed, 55 insertions(+), 85 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 40640c26680e..95e376e3b911 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -45,11 +45,10 @@ static unsigned int dccp_v4_pernet_id __read_mostly; int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -91,26 +90,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = fl4->daddr; if (inet->inet_saddr == 0) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = - inet_bhashfn_portaddr(&dccp_hashinfo, sk, - sock_net(sk), - inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } inet->inet_dport = usin->sin_port; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 626166cb6d7e..94c101ed57a9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -934,26 +934,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } if (saddr == NULL) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo, - sk, sock_net(sk), - inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } - saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + + err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); + if (err) + goto failure; } /* set the source address */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4728087c42a5..0da679411330 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1230,7 +1230,6 @@ EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { - struct inet_bind_hashbucket *prev_addr_hashbucket; struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; @@ -1260,16 +1259,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) return 0; } - prev_addr_hashbucket = - inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk, - sock_net(sk), inet->inet_num); - - inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; - - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { - inet->inet_saddr = old_saddr; - inet->inet_rcv_saddr = old_saddr; ip_rt_put(rt); return err; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d745f962745e..18ef370af113 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -858,14 +858,34 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) +static void inet_update_saddr(struct sock *sk, void *saddr, int family) +{ + if (family == AF_INET) { + inet_sk(sk)->inet_saddr = *(__be32 *)saddr; + sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else { + sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; + } +#endif +} + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); - struct inet_bind_hashbucket *head2; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + int bhash; + + if (!inet_csk(sk)->icsk_bind2_hash) { + /* Not bind()ed before. */ + inet_update_saddr(sk, saddr, family); + return 0; + } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket @@ -875,14 +895,25 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc if (!new_tb2) return -ENOMEM; + bhash = inet_bhashfn(net, port, hinfo->bhash_size); + head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - spin_lock_bh(&prev_saddr->lock); + /* If we change saddr locklessly, another thread + * iterating over bhash might see corrupted address. + */ + spin_lock_bh(&head->lock); + + spin_lock(&head2->lock); __sk_del_bind2_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); + spin_unlock(&head2->lock); + + inet_update_saddr(sk, saddr, family); - spin_lock_bh(&head2->lock); + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + + spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; @@ -890,7 +921,9 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; - spin_unlock_bh(&head2->lock); + spin_unlock(&head2->lock); + + spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a3a732b584d..23dd7e9df2d5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -199,15 +199,14 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct ip_options_rcu *inet_opt; struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -251,24 +250,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!inet->inet_saddr) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 81b396e5cf79..2f3ca3190d26 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -292,24 +292,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); + if (err) + goto failure; } /* set the source address */ -- cgit From e0833d1fedb02f038b526ae7dde178a076f56545 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:14 -0800 Subject: dccp/tcp: Fixup bhash2 bucket when connect() fails. If a socket bound to a wildcard address fails to connect(), we only reset saddr and keep the port. Then, we have to fix up the bhash2 bucket; otherwise, the bucket has an inconsistent address in the list. Also, listen() for such a socket will fire the WARN_ON() in inet_csk_get_port(). [0] Note that when a system runs out of memory, we give up fixing the bucket and unlink sk from bhash and bhash2 by inet_put_port(). [0]: WARNING: CPU: 0 PID: 207 at net/ipv4/inet_connection_sock.c:548 inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Modules linked in: CPU: 0 PID: 207 Comm: bhash2_prev_rep Not tainted 6.1.0-rc3-00799-gc8421681c845 #63 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014 RIP: 0010:inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Code: 74 a7 eb 93 48 8b 54 24 18 0f b7 cb 4c 89 e6 4c 89 ff e8 48 b2 ff ff 49 8b 87 18 04 00 00 e9 32 ff ff ff 0f 0b e9 34 ff ff ff <0f> 0b e9 42 ff ff ff 41 8b 7f 50 41 8b 4f 54 89 fe 81 f6 00 00 ff RSP: 0018:ffffc900003d7e50 EFLAGS: 00010202 RAX: ffff8881047fb500 RBX: 0000000000004e20 RCX: 0000000000000000 RDX: 000000000000000a RSI: 00000000fffffe00 RDI: 00000000ffffffff RBP: ffffffff8324dc00 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000001 R14: 0000000000004e20 R15: ffff8881054e1280 FS: 00007f8ac04dc740(0000) GS:ffff88842fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020001540 CR3: 00000001055fa003 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: inet_csk_listen_start (net/ipv4/inet_connection_sock.c:1205) inet_listen (net/ipv4/af_inet.c:228) __sys_listen (net/socket.c:1810) __x64_sys_listen (net/socket.c:1819 net/socket.c:1817 net/socket.c:1817) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f8ac051de5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc1c177248 EFLAGS: 00000206 ORIG_RAX: 0000000000000032 RAX: ffffffffffffffda RBX: 0000000020001550 RCX: 00007f8ac051de5d RDX: ffffffffffffff80 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 00007ffc1c177270 R08: 0000000000000018 R09: 0000000000000007 R10: 0000000020001540 R11: 0000000000000206 R12: 00007ffc1c177388 R13: 0000000000401169 R14: 0000000000403e18 R15: 00007f8ac0723000 Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: syzbot Reported-by: Mat Martineau Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/dccp/ipv4.c | 3 +-- net/dccp/ipv6.c | 3 +-- net/dccp/proto.c | 3 +-- net/ipv4/inet_hashtables.c | 38 ++++++++++++++++++++++++++++++++++---- net/ipv4/tcp.c | 3 +-- net/ipv4/tcp_ipv4.c | 3 +-- net/ipv6/tcp_ipv6.c | 3 +-- 7 files changed, 40 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 95e376e3b911..b780827f5e0a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -143,8 +143,7 @@ failure: * This unhashes the socket and releases the local port, if necessary. */ dccp_set_state(sk, DCCP_CLOSED); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 94c101ed57a9..602f3432d80b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -970,8 +970,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: dccp_set_state(sk, DCCP_CLOSED); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index c548ca3e9b0e..85e35c5e8890 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -279,8 +279,7 @@ int dccp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 18ef370af113..3cec471a2cd2 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -871,7 +871,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) #endif } -int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; @@ -883,7 +883,11 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ - inet_update_saddr(sk, saddr, family); + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); + return 0; } @@ -892,8 +896,19 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); - if (!new_tb2) + if (!new_tb2) { + if (reset) { + /* The (INADDR_ANY, port) bucket might have already + * been freed, then we cannot fixup icsk_bind2_hash, + * so we give up and unlink sk from bhash/bhash2 not + * to leave inconsistency in bhash2. + */ + inet_put_port(sk); + inet_reset_saddr(sk); + } + return -ENOMEM; + } bhash = inet_bhashfn(net, port, hinfo->bhash_size); head = &hinfo->bhash[bhash]; @@ -909,7 +924,10 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); - inet_update_saddr(sk, saddr, family); + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); @@ -930,8 +948,20 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) return 0; } + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +{ + return __inet_bhash2_update_saddr(sk, saddr, family, false); +} EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +void inet_bhash2_reset_saddr(struct sock *sk) +{ + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + __inet_bhash2_update_saddr(sk, NULL, 0, true); +} +EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); + /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 54836a6b81d6..4f2205756cfe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3114,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 23dd7e9df2d5..da46357f501b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -331,8 +331,7 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2f3ca3190d26..f0548dbcabd2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -346,8 +346,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; -- cgit From 568fe84940ac0e4e0b2cd7751b8b4911f7b9c215 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 20 Nov 2022 15:28:38 +0800 Subject: ipv4: Fix error return code in fib_table_insert() In fib_table_insert(), if the alias was already inserted, but node not exist, the error code should be set before return from error handling path. Fixes: a6c76c17df02 ("ipv4: Notify route after insertion to the routing table") Signed-off-by: Ziyang Xuan Link: https://lore.kernel.org/r/20221120072838.2167047-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_trie.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c88bf856c443..74d403dbd2b4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1381,8 +1381,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, /* The alias was already inserted, so the node must exist. */ l = l ? l : fib_find_node(t, &tp, key); - if (WARN_ON_ONCE(!l)) + if (WARN_ON_ONCE(!l)) { + err = -ENOENT; goto out_free_new_fa; + } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { -- cgit From 391c18cf776eb4569ecda1f7794f360fe0a45a26 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 18 Nov 2022 22:44:41 +0900 Subject: 9p/xen: check logical size for buffer size trans_xen did not check the data fits into the buffer before copying from the xen ring, but we probably should. Add a check that just skips the request and return an error to userspace if it did not fit Tested-by: Stefano Stabellini Reviewed-by: Christian Schoenebeck Link: https://lkml.kernel.org/r/20221118135542.63400-1-asmadeus@codewreck.org Signed-off-by: Dominique Martinet --- net/9p/trans_xen.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index b15c64128c3e..aaa5fd364691 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -208,6 +208,14 @@ static void p9_xen_response(struct work_struct *work) continue; } + if (h.size > req->rc.capacity) { + dev_warn(&priv->dev->dev, + "requested packet size too big: %d for tag %d with capacity %zd\n", + h.size, h.tag, req->rc.capacity); + req->status = REQ_STATUS_ERROR; + goto recv_error; + } + memcpy(&req->rc, &h, sizeof(h)); req->rc.offset = 0; @@ -217,6 +225,7 @@ static void p9_xen_response(struct work_struct *work) masked_prod, &masked_cons, XEN_9PFS_RING_SIZE(ring)); +recv_error: virt_mb(); cons += h.size; ring->intf->in_cons = cons; -- cgit From af295e854a4e3813ffbdef26dbb6a4d6226c3ea1 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 21 Nov 2022 09:54:26 +0100 Subject: l2tp: Don't sleep and disable BH under writer-side sk_callback_lock When holding a reader-writer spin lock we cannot sleep. Calling setup_udp_tunnel_sock() with write lock held violates this rule, because we end up calling percpu_down_read(), which might sleep, as syzbot reports [1]: __might_resched.cold+0x222/0x26b kernel/sched/core.c:9890 percpu_down_read include/linux/percpu-rwsem.h:49 [inline] cpus_read_lock+0x1b/0x140 kernel/cpu.c:310 static_key_slow_inc+0x12/0x20 kernel/jump_label.c:158 udp_tunnel_encap_enable include/net/udp_tunnel.h:187 [inline] setup_udp_tunnel_sock+0x43d/0x550 net/ipv4/udp_tunnel_core.c:81 l2tp_tunnel_register+0xc51/0x1210 net/l2tp/l2tp_core.c:1509 pppol2tp_connect+0xcdc/0x1a10 net/l2tp/l2tp_ppp.c:723 Trim the writer-side critical section for sk_callback_lock down to the minimum, so that it covers only operations on sk_user_data. Also, when grabbing the sk_callback_lock, we always need to disable BH, as Eric points out. Failing to do so leads to deadlocks because we acquire sk_callback_lock in softirq context, which can get stuck waiting on us if: 1) it runs on the same CPU, or CPU0 ---- lock(clock-AF_INET6); lock(clock-AF_INET6); 2) lock ordering leads to priority inversion CPU0 CPU1 ---- ---- lock(clock-AF_INET6); local_irq_disable(); lock(&tcp_hashinfo.bhash[i].lock); lock(clock-AF_INET6); lock(&tcp_hashinfo.bhash[i].lock); ... as syzbot reports [2,3]. Use the _bh variants for write_(un)lock. [1] https://lore.kernel.org/netdev/0000000000004e78ec05eda79749@google.com/ [2] https://lore.kernel.org/netdev/000000000000e38b6605eda76f98@google.com/ [3] https://lore.kernel.org/netdev/000000000000dfa31e05eda76f75@google.com/ v2: - Check and set sk_user_data while holding sk_callback_lock for both L2TP encapsulation types (IP and UDP) (Tetsuo) Cc: Tom Parkin Cc: Tetsuo Handa Fixes: b68777d54fac ("l2tp: Serialize access to sk_user_data with sk_callback_lock") Reported-by: Eric Dumazet Reported-by: syzbot+703d9e154b3b58277261@syzkaller.appspotmail.com Reported-by: syzbot+50680ced9e98a61f7698@syzkaller.appspotmail.com Reported-by: syzbot+de987172bb74a381879b@syzkaller.appspotmail.com Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 754fdda8a5f5..9a1415fe3fa7 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1474,11 +1474,12 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, } sk = sock->sk; - write_lock(&sk->sk_callback_lock); - + write_lock_bh(&sk->sk_callback_lock); ret = l2tp_validate_socket(sk, net, tunnel->encap); if (ret < 0) - goto err_sock; + goto err_inval_sock; + rcu_assign_sk_user_data(sk, tunnel); + write_unlock_bh(&sk->sk_callback_lock); tunnel->l2tp_net = net; pn = l2tp_pernet(net); @@ -1507,8 +1508,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, }; setup_udp_tunnel_sock(net, sock, &udp_cfg); - } else { - rcu_assign_sk_user_data(sk, tunnel); } tunnel->old_sk_destruct = sk->sk_destruct; @@ -1522,16 +1521,18 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, if (tunnel->fd >= 0) sockfd_put(sock); - write_unlock(&sk->sk_callback_lock); return 0; err_sock: + write_lock_bh(&sk->sk_callback_lock); + rcu_assign_sk_user_data(sk, NULL); +err_inval_sock: + write_unlock_bh(&sk->sk_callback_lock); + if (tunnel->fd < 0) sock_release(sock); else sockfd_put(sock); - - write_unlock(&sk->sk_callback_lock); err: return ret; } -- cgit From 9f16b5c82a025cd4c864737409234ddc44fb166a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Nov 2022 12:36:57 +0100 Subject: wifi: cfg80211: fix buffer overflow in elem comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For vendor elements, the code here assumes that 5 octets are present without checking. Since the element itself is already checked to fit, we only need to check the length. Reported-and-tested-by: Sönke Huster Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Signed-off-by: Johannes Berg --- net/wireless/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index da752b0cc752..4d217798890a 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -330,7 +330,8 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, * determine if they are the same ie. */ if (tmp_old[0] == WLAN_EID_VENDOR_SPECIFIC) { - if (!memcmp(tmp_old + 2, tmp + 2, 5)) { + if (tmp_old[1] >= 5 && tmp[1] >= 5 && + !memcmp(tmp_old + 2, tmp + 2, 5)) { /* same vendor ie, copy from * subelement */ -- cgit From acd3c92acc7aaec50a94d0a7faf7ccd74e952493 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Nov 2022 12:36:58 +0100 Subject: wifi: cfg80211: don't allow multi-BSSID in S1G MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In S1G beacon frames there shouldn't be multi-BSSID elements since that's not supported, remove that to avoid a potential integer underflow and/or misparsing the frames due to the different length of the fixed part of the frame. While at it, initialize non_tx_data so we don't send garbage values to the user (even if it doesn't seem to matter now.) Reported-and-tested-by: Sönke Huster Fixes: 9eaffe5078ca ("cfg80211: convert S1G beacon to scan results") Signed-off-by: Johannes Berg --- net/wireless/scan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4d217798890a..3d86482e83f5 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2527,10 +2527,15 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, const struct cfg80211_bss_ies *ies1, *ies2; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); - struct cfg80211_non_tx_bss non_tx_data; + struct cfg80211_non_tx_bss non_tx_data = {}; res = cfg80211_inform_single_bss_frame_data(wiphy, data, mgmt, len, gfp); + + /* don't do any further MBSSID handling for S1G */ + if (ieee80211_is_s1g_beacon(mgmt->frame_control)) + return res; + if (!res || !wiphy->support_mbssid || !cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) return res; -- cgit From 3e8f7abcc3473bc9603323803aeaed4ffcc3a2ab Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 8 Nov 2022 16:19:26 +0100 Subject: wifi: mac8021: fix possible oob access in ieee80211_get_rate_duration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix possible out-of-bound access in ieee80211_get_rate_duration routine as reported by the following UBSAN report: UBSAN: array-index-out-of-bounds in net/mac80211/airtime.c:455:47 index 15 is out of range for type 'u16 [12]' CPU: 2 PID: 217 Comm: kworker/u32:10 Not tainted 6.1.0-060100rc3-generic Hardware name: Acer Aspire TC-281/Aspire TC-281, BIOS R01-A2 07/18/2017 Workqueue: mt76 mt76u_tx_status_data [mt76_usb] Call Trace: show_stack+0x4e/0x61 dump_stack_lvl+0x4a/0x6f dump_stack+0x10/0x18 ubsan_epilogue+0x9/0x43 __ubsan_handle_out_of_bounds.cold+0x42/0x47 ieee80211_get_rate_duration.constprop.0+0x22f/0x2a0 [mac80211] ? ieee80211_tx_status_ext+0x32e/0x640 [mac80211] ieee80211_calc_rx_airtime+0xda/0x120 [mac80211] ieee80211_calc_tx_airtime+0xb4/0x100 [mac80211] mt76x02_send_tx_status+0x266/0x480 [mt76x02_lib] mt76x02_tx_status_data+0x52/0x80 [mt76x02_lib] mt76u_tx_status_data+0x67/0xd0 [mt76_usb] process_one_work+0x225/0x400 worker_thread+0x50/0x3e0 ? process_one_work+0x400/0x400 kthread+0xe9/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Fixes: db3e1c40cf2f ("mac80211: Import airtime calculation code from mt76") Signed-off-by: Lorenzo Bianconi Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/airtime.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index 2e66598fac79..e8ebd343e2bf 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -452,6 +452,9 @@ static u32 ieee80211_get_rate_duration(struct ieee80211_hw *hw, (status->encoding == RX_ENC_HE && streams > 8))) return 0; + if (idx >= MCS_GROUP_RATES) + return 0; + duration = airtime_mcs_groups[group].duration[idx]; duration <<= airtime_mcs_groups[group].shift; *overhead = 36 + (streams << 2); -- cgit From dcc14cfd7debe11b825cb077e75d91d2575b4cb8 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Thu, 24 Nov 2022 16:10:05 +0800 Subject: net/9p: Fix a potential socket leak in p9_socket_open Both p9_fd_create_tcp() and p9_fd_create_unix() will call p9_socket_open(). If the creation of p9_trans_fd fails, p9_fd_create_tcp() and p9_fd_create_unix() will return an error directly instead of releasing the cscoket, which will result in a socket leak. This patch adds sock_release() to fix the leak issue. Fixes: 6b18662e239a ("9p connect fixes") Signed-off-by: Wang Hai ACKed-by: Al Viro Signed-off-by: David S. Miller --- net/9p/trans_fd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 56a186768750..f834726d21ea 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -860,8 +860,10 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) struct file *file; p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); - if (!p) + if (!p) { + sock_release(csocket); return -ENOMEM; + } csocket->sk->sk_allocation = GFP_NOIO; file = sock_alloc_file(csocket, 0, NULL); -- cgit From d5082d386eee7e8ec46fa8581932c81a4961dcef Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 24 Nov 2022 23:09:32 +0200 Subject: ipv4: Fix route deletion when nexthop info is not specified When the kernel receives a route deletion request from user space it tries to delete a route that matches the route attributes specified in the request. If only prefix information is specified in the request, the kernel should delete the first matching FIB alias regardless of its associated FIB info. However, an error is currently returned when the FIB info is backed by a nexthop object: # ip nexthop add id 1 via 192.0.2.2 dev dummy10 # ip route add 198.51.100.0/24 nhid 1 # ip route del 198.51.100.0/24 RTNETLINK answers: No such process Fix by matching on such a FIB info when legacy nexthop attributes are not specified in the request. An earlier check already covers the case where a nexthop ID is specified in the request. Add tests that cover these flows. Before the fix: # ./fib_nexthops.sh -t ipv4_fcnal ... TEST: Delete route when not specifying nexthop attributes [FAIL] Tests passed: 11 Tests failed: 1 After the fix: # ./fib_nexthops.sh -t ipv4_fcnal ... TEST: Delete route when not specifying nexthop attributes [ OK ] Tests passed: 12 Tests failed: 0 No regressions in other tests: # ./fib_nexthops.sh ... Tests passed: 228 Tests failed: 0 # ./fib_tests.sh ... Tests passed: 186 Tests failed: 0 Cc: stable@vger.kernel.org Reported-by: Jonas Gorski Tested-by: Jonas Gorski Fixes: 493ced1ac47c ("ipv4: Allow routes to use nexthop objects") Fixes: 6bf92d70e690 ("net: ipv4: fix route with nexthop object delete warning") Fixes: 61b91eb33a69 ("ipv4: Handle attempt to delete multipath route when fib_info contains an nh reference") Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20221124210932.2470010-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f721c308248b..19a662003eef 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -888,9 +888,11 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, return 1; } - /* cannot match on nexthop object attributes */ - if (fi->nh) - return 1; + if (fi->nh) { + if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp) + return 1; + return 0; + } if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; -- cgit From fe94800184f22d4778628f1321dce5acb7513d84 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 28 Nov 2022 16:42:37 +0100 Subject: mptcp: don't orphan ssk in mptcp_close() All of the subflows of a msk will be orphaned in mptcp_close(), which means the subflows are in DEAD state. After then, DATA_FIN will be sent, and the other side will response with a DATA_ACK for this DATA_FIN. However, if the other side still has pending data, the data that received on these subflows will not be passed to the msk, as they are DEAD and subflow_data_ready() will not be called in tcp_data_ready(). Therefore, these data can't be acked, and they will be retransmitted again and again, until timeout. Fix this by setting ssk->sk_socket and ssk->sk_wq to 'NULL', instead of orphaning the subflows in __mptcp_close(), as Paolo suggested. Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Reviewed-by: Biao Jiang Reviewed-by: Mengen Sun Signed-off-by: Menglong Dong Reviewed-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b6dc6e260334..1dbc62537259 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2354,12 +2354,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, goto out; } - /* if we are invoked by the msk cleanup code, the subflow is - * already orphaned - */ - if (ssk->sk_socket) - sock_orphan(ssk); - + sock_orphan(ssk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2940,7 +2935,11 @@ cleanup: if (ssk == msk->first) subflow->fail_tout = 0; - sock_orphan(ssk); + /* detach from the parent socket, but allow data_ready to + * push incoming data into the mptcp stack, to properly ack it + */ + ssk->sk_socket = NULL; + ssk->sk_wq = NULL; unlock_sock_fast(ssk, slow); } sock_orphan(sk); -- cgit From b4f166651d03b5484fa179817ba8ad4899a5a6ac Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 28 Nov 2022 16:42:38 +0100 Subject: mptcp: fix sleep in atomic at close time Matt reported a splat at msk close time: BUG: sleeping function called from invalid context at net/mptcp/protocol.c:2877 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 155, name: packetdrill preempt_count: 201, expected: 0 RCU nest depth: 0, expected: 0 4 locks held by packetdrill/155: #0: ffff888001536990 (&sb->s_type->i_mutex_key#6){+.+.}-{3:3}, at: __sock_release (net/socket.c:650) #1: ffff88800b498130 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_close (net/mptcp/protocol.c:2973) #2: ffff88800b49a130 (sk_lock-AF_INET/1){+.+.}-{0:0}, at: __mptcp_close_ssk (net/mptcp/protocol.c:2363) #3: ffff88800b49a0b0 (slock-AF_INET){+...}-{2:2}, at: __lock_sock_fast (include/net/sock.h:1820) Preemption disabled at: 0x0 CPU: 1 PID: 155 Comm: packetdrill Not tainted 6.1.0-rc5 #365 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:107 (discriminator 4)) __might_resched.cold (kernel/sched/core.c:9891) __mptcp_destroy_sock (include/linux/kernel.h:110) __mptcp_close (net/mptcp/protocol.c:2959) mptcp_subflow_queue_clean (include/net/sock.h:1777) __mptcp_close_ssk (net/mptcp/protocol.c:2363) mptcp_destroy_common (net/mptcp/protocol.c:3170) mptcp_destroy (include/net/sock.h:1495) __mptcp_destroy_sock (net/mptcp/protocol.c:2886) __mptcp_close (net/mptcp/protocol.c:2959) mptcp_close (net/mptcp/protocol.c:2974) inet_release (net/ipv4/af_inet.c:432) __sock_release (net/socket.c:651) sock_close (net/socket.c:1367) __fput (fs/file_table.c:320) task_work_run (kernel/task_work.c:181 (discriminator 1)) exit_to_user_mode_prepare (include/linux/resume_user_mode.h:49) syscall_exit_to_user_mode (kernel/entry/common.c:130) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) We can't call mptcp_close under the 'fast' socket lock variant, replace it with a sock_lock_nested() as the relevant code is already under the listening msk socket lock protection. Reported-by: Matthieu Baerts Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/316 Fixes: 30e51b923e43 ("mptcp: fix unreleased socket in accept queue") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 02a54d59697b..2159b5f9988f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1745,16 +1745,16 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk) for (msk = head; msk; msk = next) { struct sock *sk = (struct sock *)msk; - bool slow, do_cancel_work; + bool do_cancel_work; sock_hold(sk); - slow = lock_sock_fast_nested(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); next = msk->dl_next; msk->first = NULL; msk->dl_next = NULL; do_cancel_work = __mptcp_close(sk, 0); - unlock_sock_fast(sk, slow); + release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); sock_put(sk); -- cgit From 3067bc61fcfe3081bf4807ce65560f499e895e77 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 25 Nov 2022 12:46:43 -0500 Subject: tipc: re-fetch skb cb after tipc_msg_validate As the call trace shows, the original skb was freed in tipc_msg_validate(), and dereferencing the old skb cb would cause an use-after-free crash. BUG: KASAN: use-after-free in tipc_crypto_rcv_complete+0x1835/0x2240 [tipc] Call Trace: tipc_crypto_rcv_complete+0x1835/0x2240 [tipc] tipc_crypto_rcv+0xd32/0x1ec0 [tipc] tipc_rcv+0x744/0x1150 [tipc] ... Allocated by task 47078: kmem_cache_alloc_node+0x158/0x4d0 __alloc_skb+0x1c1/0x270 tipc_buf_acquire+0x1e/0xe0 [tipc] tipc_msg_create+0x33/0x1c0 [tipc] tipc_link_build_proto_msg+0x38a/0x2100 [tipc] tipc_link_timeout+0x8b8/0xef0 [tipc] tipc_node_timeout+0x2a1/0x960 [tipc] call_timer_fn+0x2d/0x1c0 ... Freed by task 47078: tipc_msg_validate+0x7b/0x440 [tipc] tipc_crypto_rcv_complete+0x4b5/0x2240 [tipc] tipc_crypto_rcv+0xd32/0x1ec0 [tipc] tipc_rcv+0x744/0x1150 [tipc] This patch fixes it by re-fetching the skb cb from the new allocated skb after calling tipc_msg_validate(). Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Reported-by: Shuang Li Signed-off-by: Xin Long Link: https://lore.kernel.org/r/1b1cdba762915325bd8ef9a98d0276eb673df2a5.1669398403.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/tipc/crypto.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index f09316a9035f..d67440de011e 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1971,6 +1971,9 @@ rcv: /* Ok, everything's fine, try to synch own keys according to peers' */ tipc_crypto_key_synch(rx, *skb); + /* Re-fetch skb cb as skb might be changed in tipc_msg_validate */ + skb_cb = TIPC_SKB_CB(*skb); + /* Mark skb decrypted */ skb_cb->decrypted = 1; -- cgit From 7e177d32442b7ed08a9fa61b61724abc548cb248 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 25 Nov 2022 15:57:24 +0800 Subject: net: hsr: Fix potential use-after-free The skb is delivered to netif_rx() which may free it, after calling this, dereferencing skb may trigger use-after-free. Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20221125075724.27912-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_forward.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index a50429a62f74..56bb27d67a2e 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -351,17 +351,18 @@ static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, struct hsr_node *node_src) { bool was_multicast_frame; - int res; + int res, recv_len; was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST); hsr_addr_subst_source(node_src, skb); skb_pull(skb, ETH_HLEN); + recv_len = skb->len; res = netif_rx(skb); if (res == NET_RX_DROP) { dev->stats.rx_dropped++; } else { dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + dev->stats.rx_bytes += recv_len; if (was_multicast_frame) dev->stats.multicast++; } -- cgit From b85f628aa158a653c006e9c1405a117baef8c868 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 28 Nov 2022 11:18:12 -0500 Subject: packet: do not set TP_STATUS_CSUM_VALID on CHECKSUM_COMPLETE CHECKSUM_COMPLETE signals that skb->csum stores the sum over the entire packet. It does not imply that an embedded l4 checksum field has been validated. Fixes: 682f048bd494 ("af_packet: pass checksum validation status to the user") Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20221128161812.640098-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 6ce8dd19f33c..1ab65f7f2a0a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2293,8 +2293,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (snaplen > res) @@ -3520,8 +3519,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; aux.tp_len = origlen; -- cgit From 9ed7bfc79542119ac0a9e1ce8a2a5285e43433e9 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Sat, 26 Nov 2022 11:17:20 +0800 Subject: sctp: fix memory leak in sctp_stream_outq_migrate() When sctp_stream_outq_migrate() is called to release stream out resources, the memory pointed to by prio_head in stream out is not released. The memory leak information is as follows: unreferenced object 0xffff88801fe79f80 (size 64): comm "sctp_repo", pid 7957, jiffies 4294951704 (age 36.480s) hex dump (first 32 bytes): 80 9f e7 1f 80 88 ff ff 80 9f e7 1f 80 88 ff ff ................ 90 9f e7 1f 80 88 ff ff 90 9f e7 1f 80 88 ff ff ................ backtrace: [] kmalloc_trace+0x26/0x60 [] sctp_sched_prio_set+0x4cc/0x770 [] sctp_stream_init_ext+0xd2/0x1b0 [] sctp_sendmsg_to_asoc+0x1614/0x1a30 [] sctp_sendmsg+0xda1/0x1ef0 [] inet_sendmsg+0x9d/0xe0 [] sock_sendmsg+0xd3/0x120 [] __sys_sendto+0x23a/0x340 [] __x64_sys_sendto+0xe1/0x1b0 [] do_syscall_64+0x39/0xb0 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://syzkaller.appspot.com/bug?exrid=29c402e56c4760763cc0 Fixes: 637784ade221 ("sctp: introduce priority based stream scheduler") Reported-by: syzbot+29c402e56c4760763cc0@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20221126031720.378562-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- net/sctp/stream.c | 25 ++++++++++++++++++------- net/sctp/stream_sched.c | 5 +++++ net/sctp/stream_sched_prio.c | 19 +++++++++++++++++++ net/sctp/stream_sched_rr.c | 5 +++++ 4 files changed, 47 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index ef9fceadef8d..ee6514af830f 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -52,6 +52,19 @@ static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) } } +static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_sched_ops *sched; + + if (!SCTP_SO(stream, sid)->ext) + return; + + sched = sctp_sched_ops_from_stream(stream); + sched->free_sid(stream, sid); + kfree(SCTP_SO(stream, sid)->ext); + SCTP_SO(stream, sid)->ext = NULL; +} + /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. @@ -70,16 +83,14 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { - kfree(SCTP_SO(new, i)->ext); + sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } - for (i = outcnt; i < stream->outcnt; i++) { - kfree(SCTP_SO(stream, i)->ext); - SCTP_SO(stream, i)->ext = NULL; - } + for (i = outcnt; i < stream->outcnt; i++) + sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, @@ -174,9 +185,9 @@ void sctp_stream_free(struct sctp_stream *stream) struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; - sched->free(stream); + sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) - kfree(SCTP_SO(stream, i)->ext); + sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 1ad565ed5627..7c8f9d89e16a 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -46,6 +46,10 @@ static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, return 0; } +static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) +{ +} + static void sctp_sched_fcfs_free(struct sctp_stream *stream) { } @@ -96,6 +100,7 @@ static struct sctp_sched_ops sctp_sched_fcfs = { .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, + .free_sid = sctp_sched_fcfs_free_sid, .free = sctp_sched_fcfs_free, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 80b5a2c4cbc7..4fc9f2923ed1 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -204,6 +204,24 @@ static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, return sctp_sched_prio_set(stream, sid, 0, gfp); } +static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_stream_priorities *prio = SCTP_SO(stream, sid)->ext->prio_head; + int i; + + if (!prio) + return; + + SCTP_SO(stream, sid)->ext->prio_head = NULL; + for (i = 0; i < stream->outcnt; i++) { + if (SCTP_SO(stream, i)->ext && + SCTP_SO(stream, i)->ext->prio_head == prio) + return; + } + + kfree(prio); +} + static void sctp_sched_prio_free(struct sctp_stream *stream) { struct sctp_stream_priorities *prio, *n; @@ -323,6 +341,7 @@ static struct sctp_sched_ops sctp_sched_prio = { .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, .init_sid = sctp_sched_prio_init_sid, + .free_sid = sctp_sched_prio_free_sid, .free = sctp_sched_prio_free, .enqueue = sctp_sched_prio_enqueue, .dequeue = sctp_sched_prio_dequeue, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index ff425aed62c7..cc444fe0d67c 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -90,6 +90,10 @@ static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, return 0; } +static void sctp_sched_rr_free_sid(struct sctp_stream *stream, __u16 sid) +{ +} + static void sctp_sched_rr_free(struct sctp_stream *stream) { sctp_sched_rr_unsched_all(stream); @@ -177,6 +181,7 @@ static struct sctp_sched_ops sctp_sched_rr = { .get = sctp_sched_rr_get, .init = sctp_sched_rr_init, .init_sid = sctp_sched_rr_init_sid, + .free_sid = sctp_sched_rr_free_sid, .free = sctp_sched_rr_free, .enqueue = sctp_sched_rr_enqueue, .dequeue = sctp_sched_rr_dequeue, -- cgit