From 2cdaa3eefed83082923cf219c8b6a314e622da74 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 18 Apr 2023 23:31:26 +0200 Subject: netfilter: conntrack: restore IPS_CONFIRMED out of nf_conntrack_hash_check_insert() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit e6d57e9ff0ae ("netfilter: conntrack: fix rmmod double-free race") consolidates IPS_CONFIRMED bit set in nf_conntrack_hash_check_insert(). However, this breaks ctnetlink: # conntrack -I -p tcp --timeout 123 --src 1.2.3.4 --dst 5.6.7.8 --state ESTABLISHED --sport 1 --dport 4 -u SEEN_REPLY conntrack v1.4.6 (conntrack-tools): Operation failed: Device or resource busy This is a partial revert of the aforementioned commit to restore IPS_CONFIRMED. Fixes: e6d57e9ff0ae ("netfilter: conntrack: fix rmmod double-free race") Reported-by: Stéphane Graber Tested-by: Stéphane Graber Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_bpf.c | 1 + net/netfilter/nf_conntrack_core.c | 1 - net/netfilter/nf_conntrack_netlink.c | 3 +++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index cd99e6dc1f35..34913521c385 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -381,6 +381,7 @@ __bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { nf_conntrack_free(nfct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c6a6a6099b4e..7ba6ab9b54b5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -932,7 +932,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto out; } - ct->status |= IPS_CONFIRMED; smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bfc3aaa2c872..d3ee18854698 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2316,6 +2316,9 @@ ctnetlink_create_conntrack(struct net *net, nfct_seqadj_ext_add(ct); nfct_synproxy_ext_add(ct); + /* we must add conntrack extensions before confirmation. */ + ct->status |= IPS_CONFIRMED; + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) -- cgit From 73db1b8f2bb6725b7391e85aab41fdf592b3c0c1 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Wed, 19 Apr 2023 13:15:26 +0800 Subject: netfilter: conntrack: fix wrong ct->timeout value (struct nf_conn)->timeout is an interval before the conntrack confirmed. After confirmed, it becomes a timestamp. It is observed that timeout of an unconfirmed conntrack: - Set by calling ctnetlink_change_timeout(). As a result, `nfct_time_stamp` was wrongly added to `ct->timeout` twice. - Get by calling ctnetlink_dump_timeout(). As a result, `nfct_time_stamp` was wrongly subtracted. Call Trace: dump_stack_lvl ctnetlink_dump_timeout __ctnetlink_glue_build ctnetlink_glue_build __nfqnl_enqueue_packet nf_queue nf_hook_slow ip_mc_output ? __pfx_ip_finish_output ip_send_skb ? __pfx_dst_output udp_send_skb udp_sendmsg ? __pfx_ip_generic_getfrag sock_sendmsg Separate the 2 cases in: - Setting `ct->timeout` in __nf_ct_set_timeout(). - Getting `ct->timeout` in ctnetlink_dump_timeout(). Pablo appends: Update ctnetlink to set up the timeout _after_ the IPS_CONFIRMED flag is set on, otherwise conntrack creation via ctnetlink breaks. Note that the problem described in this patch occurs since the introduction of the nfnetlink_queue conntrack support, select a sufficiently old Fixes: tag for -stable kernel to pick up this fix. Fixes: a4b4766c3ceb ("netfilter: nfnetlink_queue: rename related to nfqueue attaching conntrack info") Signed-off-by: Tzung-Bi Shih Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d3ee18854698..6f3b23a6653c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -176,7 +176,12 @@ nla_put_failure: static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, bool skip_zero) { - long timeout = nf_ct_expires(ct) / HZ; + long timeout; + + if (nf_ct_is_confirmed(ct)) + timeout = nf_ct_expires(ct) / HZ; + else + timeout = ct->timeout / HZ; if (skip_zero && timeout == 0) return 0; @@ -2253,9 +2258,6 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - __nf_ct_set_timeout(ct, timeout); - rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; @@ -2319,6 +2321,9 @@ ctnetlink_create_conntrack(struct net *net, /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + __nf_ct_set_timeout(ct, timeout); + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) -- cgit From 7041101ff6c3073fd8f2e99920f535b111c929cb Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 20 Apr 2023 16:59:46 +0200 Subject: net/sched: sch_fq: fix integer overflow of "credit" if sch_fq is configured with "initial quantum" having values greater than INT_MAX, the first assignment of "credit" does signed integer overflow to a very negative value. In this situation, the syzkaller script provided by Cristoph triggers the CPU soft-lockup warning even with few sockets. It's not an infinite loop, but "credit" wasn't probably meant to be minus 2Gb for each new flow. Capping "initial quantum" to INT_MAX proved to fix the issue. v2: validation of "initial quantum" is done in fq_policy, instead of open coding in fq_change() _ suggested by Jakub Kicinski Reported-by: Christoph Paasch Link: https://github.com/multipath-tcp/mptcp_net-next/issues/377 Fixes: afe4fd062416 ("pkt_sched: fq: Fair Queue packet scheduler") Reviewed-by: Eric Dumazet Signed-off-by: Davide Caratti Link: https://lore.kernel.org/r/7b3a3c7e36d03068707a021760a194a8eb5ad41a.1682002300.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 48d14fb90ba0..f59a2cb2c803 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -779,13 +779,17 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; } +static struct netlink_range_validation iq_range = { + .max = INT_MAX, +}; + static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, - [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range), [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, -- cgit From 2cc8a008d62f3c04eeb7ec6fe59e542802bb8df3 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 20 Apr 2023 20:36:33 +0200 Subject: net/sched: cls_api: Initialize miss_cookie_node when action miss is not used Function tcf_exts_init_ex() sets exts->miss_cookie_node ptr only when use_action_miss is true so it assumes in other case that the field is set to NULL by the caller. If not then the field contains garbage and subsequent tcf_exts_destroy() call results in a crash. Ensure that the field .miss_cookie_node pointer is NULL when use_action_miss parameter is false to avoid this potential scenario. Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Signed-off-by: Ivan Vecera Reviewed-by: Pedro Tammela Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230420183634.1139391-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 35785a36c802..3c3629c9e7b6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3211,6 +3211,7 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, #ifdef CONFIG_NET_CLS_ACT exts->type = 0; exts->nr_actions = 0; + exts->miss_cookie_node = NULL; /* Note: we do not own yet a reference on net. * This reference might be taken later from tcf_exts_get_net(). */ -- cgit From 99e5acae193e369b71217efe6f1dad42f3f18815 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Thu, 20 Apr 2023 20:40:35 +0800 Subject: ipv4: Fix potential uninit variable access bug in __ip_make_skb() Like commit ea30388baebc ("ipv6: Fix an uninit variable access bug in __ip6_make_skb()"). icmphdr does not in skb linear region under the scenario of SOCK_RAW socket. Access icmp_hdr(skb)->type directly will trigger the uninit variable access bug. Use a local variable icmp_type to carry the correct value in different scenarios. Fixes: 96793b482540 ("[IPV4]: Add ICMPMsgStats MIB (RFC 4293)") Reviewed-by: Willem de Bruijn Signed-off-by: Ziyang Xuan Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4e4e308c3230..89bd7872c629 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1570,9 +1570,19 @@ struct sk_buff *__ip_make_skb(struct sock *sk, cork->dst = NULL; skb_dst_set(skb, &rt->dst); - if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(net, ((struct icmphdr *) - skb_transport_header(skb))->type); + if (iph->protocol == IPPROTO_ICMP) { + u8 icmp_type; + + /* For such sockets, transhdrlen is zero when do ip_append_data(), + * so icmphdr does not in skb linear region and can not get icmp_type + * by icmp_hdr(skb)->type. + */ + if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl) + icmp_type = fl4->fl4_icmp_type; + else + icmp_type = icmp_hdr(skb)->type; + icmp_out_count(net, icmp_type); + } ip_cork_release(cork); out: -- cgit From e0416e7d33361d2ad0bf9f007428346579ac854a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Apr 2023 23:03:46 +0100 Subject: rxrpc: Fix potential race in error handling in afs_make_call() If the rxrpc call set up by afs_make_call() receives an error whilst it is transmitting the request, there's the possibility that it may get to the point the rxrpc call is ended (after the error_kill_call label) just as the call is queued for async processing. This could manifest itself as call->rxcall being seen as NULL in afs_deliver_to_call() when it tries to lock the call. Fix this by splitting rxrpc_kernel_end_call() into a function to shut down an rxrpc call and a function to release the caller's reference and calling the latter only when we get to afs_put_call(). Reported-by: Jeffrey Altman Signed-off-by: David Howells Tested-by: kafs-testing+fedora36_64checkkafs-build-306@auristor.com cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 37 +++++++++++++++++++++++++------------ net/rxrpc/rxperf.c | 3 ++- 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 102f5cbff91a..c32b164206f9 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -342,31 +342,44 @@ static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, } /** - * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using + * rxrpc_kernel_shutdown_call - Allow a kernel service to shut down a call it was using * @sock: The socket the call is on * @call: The call to end * - * Allow a kernel service to end a call it was using. The call must be + * Allow a kernel service to shut down a call it was using. The call must be * complete before this is called (the call should be aborted if necessary). */ -void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) +void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, refcount_read(&call->ref)); mutex_lock(&call->user_mutex); - rxrpc_release_call(rxrpc_sk(sock->sk), call); - - /* Make sure we're not going to call back into a kernel service */ - if (call->notify_rx) { - spin_lock(&call->notify_lock); - call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock(&call->notify_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + rxrpc_release_call(rxrpc_sk(sock->sk), call); + + /* Make sure we're not going to call back into a kernel service */ + if (call->notify_rx) { + spin_lock(&call->notify_lock); + call->notify_rx = rxrpc_dummy_notify_rx; + spin_unlock(&call->notify_lock); + } } - mutex_unlock(&call->user_mutex); +} +EXPORT_SYMBOL(rxrpc_kernel_shutdown_call); + +/** + * rxrpc_kernel_put_call - Release a reference to a call + * @sock: The socket the call is on + * @call: The call to put + * + * Drop the application's ref on an rxrpc call. + */ +void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call) +{ rxrpc_put_call(call, rxrpc_call_put_kernel); } -EXPORT_SYMBOL(rxrpc_kernel_end_call); +EXPORT_SYMBOL(rxrpc_kernel_put_call); /** * rxrpc_kernel_check_life - Check to see whether a call is still alive diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 4a2e90015ca7..085e7892d310 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -342,7 +342,8 @@ static void rxperf_deliver_to_call(struct work_struct *work) call_complete: rxperf_set_call_complete(call, ret, remote_abort); /* The call may have been requeued */ - rxrpc_kernel_end_call(rxperf_socket, call->rxcall); + rxrpc_kernel_shutdown_call(rxperf_socket, call->rxcall); + rxrpc_kernel_put_call(rxperf_socket, call->rxcall); cancel_work(&call->work); kfree(call); } -- cgit From fadfc57cc8047080a123b16f288b7138524fb1e2 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Fri, 21 Apr 2023 17:16:17 +0100 Subject: rxrpc: Fix error when reading rxrpc tokens When converting from ASSERTCMP to WARN_ON, the tested condition must be inverted, which was missed for this case. This would cause an EIO error when trying to read an rxrpc token, for instance when trying to display tokens with AuriStor's "tokens" command. Fixes: 84924aac08a4 ("rxrpc: Fix checker warning") Signed-off-by: Marc Dionne Signed-off-by: David Howells cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 8d53aded09c4..33e8302a79e3 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -680,7 +680,7 @@ static long rxrpc_read(const struct key *key, return -ENOPKG; } - if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr == + if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr != toksize)) return -EIO; } -- cgit From 807cfded92b0a2e8be9c078c693075790c7c4131 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 22 Apr 2023 12:56:09 -0300 Subject: net/sched: sch_htb: use extack on errors messages Some error messages are still being printed to dmesg. Since extack is available, provide error messages there. Reviewed-by: Simon Horman Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 92f2975b6a82..8aef7dd9fb88 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1786,7 +1786,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, goto failure; err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy, - NULL); + extack); if (err < 0) goto failure; @@ -1858,7 +1858,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* check maximal depth */ if (parent && parent->parent && parent->parent->level < 2) { - pr_err("htb: tree is too deep\n"); + NL_SET_ERR_MSG_MOD(extack, "tree is too deep"); goto failure; } err = -ENOBUFS; @@ -1917,8 +1917,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, }; err = htb_offload(dev, &offload_opt); if (err) { - pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n", - err); + NL_SET_ERR_MSG_WEAK(extack, + "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE"); goto err_kill_estimator; } dev_queue = netdev_get_tx_queue(dev, offload_opt.qid); @@ -1937,8 +1937,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, }; err = htb_offload(dev, &offload_opt); if (err) { - pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n", - err); + NL_SET_ERR_MSG_WEAK(extack, + "Failed to offload TC_HTB_LEAF_TO_INNER"); htb_graft_helper(dev_queue, old_q); goto err_kill_estimator; } @@ -2067,8 +2067,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_put(parent_qdisc); if (warn) - pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", - cl->common.classid, (warn == -1 ? "small" : "big")); + NL_SET_ERR_MSG_FMT_MOD(extack, + "quantum of class %X is %s. Consider r2q change.", + cl->common.classid, (warn == -1 ? "small" : "big")); qdisc_class_hash_grow(sch, &q->clhash); -- cgit From c69a9b023f65b73068a17f2f4eb6a1b6e257f5bf Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 22 Apr 2023 12:56:10 -0300 Subject: net/sched: sch_qfq: use extack on errors messages Some error messages are still being printed to dmesg. Since extack is available, provide error messages there. Reviewed-by: Simon Horman Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 02098a02943e..abcc48087831 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -402,8 +402,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, int err; int delta_w; - if (tca[TCA_OPTIONS] == NULL) { - pr_notice("qfq: no options\n"); + if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) { + NL_SET_ERR_MSG_MOD(extack, "missing options"); return -EINVAL; } @@ -442,8 +442,9 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, delta_w = weight - (cl ? cl->agg->class_weight : 0); if (q->wsum + delta_w > QFQ_MAX_WSUM) { - pr_notice("qfq: total weight out of range (%d + %u)\n", - delta_w, q->wsum); + NL_SET_ERR_MSG_FMT_MOD(extack, + "total weight out of range (%d + %u)\n", + delta_w, q->wsum); return -EINVAL; } -- cgit From 25369891fcef373540f8b4e0b3bccf77a04490d5 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 22 Apr 2023 12:56:11 -0300 Subject: net/sched: sch_qfq: refactor parsing of netlink parameters Two parameters can be transformed into netlink policies and validated while parsing the netlink message. Reviewed-by: Simon Horman Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index abcc48087831..dfd9a99e6257 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -113,6 +113,7 @@ #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ +#define QFQ_MAX_LMAX (1UL << QFQ_MTU_SHIFT) #define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ @@ -214,9 +215,14 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) return container_of(clc, struct qfq_class, common); } +static struct netlink_range_validation lmax_range = { + .min = QFQ_MIN_LMAX, + .max = QFQ_MAX_LMAX, +}; + static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { - [TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, - [TCA_QFQ_LMAX] = { .type = NLA_U32 }, + [TCA_QFQ_WEIGHT] = NLA_POLICY_RANGE(NLA_U32, 1, QFQ_MAX_WEIGHT), + [TCA_QFQ_LMAX] = NLA_POLICY_FULL_RANGE(NLA_U32, &lmax_range), }; /* @@ -408,17 +414,13 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], - qfq_policy, NULL); + qfq_policy, extack); if (err < 0) return err; - if (tb[TCA_QFQ_WEIGHT]) { + if (tb[TCA_QFQ_WEIGHT]) weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); - if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) { - pr_notice("qfq: invalid weight %u\n", weight); - return -EINVAL; - } - } else + else weight = 1; if (tb[TCA_QFQ_LMAX]) @@ -426,11 +428,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, else lmax = psched_mtu(qdisc_dev(sch)); - if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { - pr_notice("qfq: invalid max length %u\n", lmax); - return -EINVAL; - } - inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; -- cgit From d913d32cc2707e9cd24fe6fa6d7d470e9c728980 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 21 Apr 2023 11:52:55 -0700 Subject: netlink: Use copy_to_user() for optval in netlink_getsockopt(). Brad Spencer provided a detailed report [0] that when calling getsockopt() for AF_NETLINK, some SOL_NETLINK options set only 1 byte even though such options require at least sizeof(int) as length. The options return a flag value that fits into 1 byte, but such behaviour confuses users who do not initialise the variable before calling getsockopt() and do not strictly check the returned value as char. Currently, netlink_getsockopt() uses put_user() to copy data to optlen and optval, but put_user() casts the data based on the pointer, char *optval. As a result, only 1 byte is set to optval. To avoid this behaviour, we need to use copy_to_user() or cast optval for put_user(). Note that this changes the behaviour on big-endian systems, but we document that the size of optval is int in the man page. $ man 7 netlink ... Socket options To set or get a netlink socket option, call getsockopt(2) to read or setsockopt(2) to write the option with the option level argument set to SOL_NETLINK. Unless otherwise noted, optval is a pointer to an int. Fixes: 9a4595bc7e67 ("[NETLINK]: Add set/getsockopt options to support more than 32 groups") Fixes: be0c22a46cfb ("netlink: add NETLINK_BROADCAST_ERROR socket option") Fixes: 38938bfe3489 ("netlink: add NETLINK_NO_ENOBUFS socket flag") Fixes: 0a6a3a23ea6e ("netlink: add NETLINK_CAP_ACK socket option") Fixes: 2d4bc93368f5 ("netlink: extended ACK reporting") Fixes: 89d35528d17d ("netlink: Add new socket option to enable strict checking on dumps") Reported-by: Brad Spencer Link: https://lore.kernel.org/netdev/ZD7VkNWFfp22kTDt@datsun.rim.net/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Johannes Berg Link: https://lore.kernel.org/r/20230421185255.94606-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 75 +++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f365dfdd672d..9b6eb28e6e94 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1742,7 +1742,8 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - int len, val, err; + unsigned int flag; + int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; @@ -1754,39 +1755,17 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { - int pos, idx, shift; + int pos, idx, shift, err = 0; - err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) @@ -1803,40 +1782,32 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); - break; + return err; } case NETLINK_CAP_ACK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_EXT_ACK ? 1 : 0; - if (put_user(len, optlen) || put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0; - if (put_user(len, optlen) || put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_STRICT_CHK; break; default: - err = -ENOPROTOOPT; + return -ENOPROTOOPT; } - return err; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = nlk->flags & flag ? 1 : 0; + + if (put_user(len, optlen) || + copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) -- cgit From 50749f2dd6854a41830996ad302aef2ffaf011d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Apr 2023 15:20:22 -0700 Subject: tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp. syzkaller reported [0] memory leaks of an UDP socket and ZEROCOPY skbs. We can reproduce the problem with these sequences: sk = socket(AF_INET, SOCK_DGRAM, 0) sk.setsockopt(SOL_SOCKET, SO_TIMESTAMPING, SOF_TIMESTAMPING_TX_SOFTWARE) sk.setsockopt(SOL_SOCKET, SO_ZEROCOPY, 1) sk.sendto(b'', MSG_ZEROCOPY, ('127.0.0.1', 53)) sk.close() sendmsg() calls msg_zerocopy_alloc(), which allocates a skb, sets skb->cb->ubuf.refcnt to 1, and calls sock_hold(). Here, struct ubuf_info_msgzc indirectly holds a refcnt of the socket. When the skb is sent, __skb_tstamp_tx() clones it and puts the clone into the socket's error queue with the TX timestamp. When the original skb is received locally, skb_copy_ubufs() calls skb_unclone(), and pskb_expand_head() increments skb->cb->ubuf.refcnt. This additional count is decremented while freeing the skb, but struct ubuf_info_msgzc still has a refcnt, so __msg_zerocopy_callback() is not called. The last refcnt is not released unless we retrieve the TX timestamped skb by recvmsg(). Since we clear the error queue in inet_sock_destruct() after the socket's refcnt reaches 0, there is a circular dependency. If we close() the socket holding such skbs, we never call sock_put() and leak the count, sk, and skb. TCP has the same problem, and commit e0c8bccd40fc ("net: stream: purge sk_error_queue in sk_stream_kill_queues()") tried to fix it by calling skb_queue_purge() during close(). However, there is a small chance that skb queued in a qdisc or device could be put into the error queue after the skb_queue_purge() call. In __skb_tstamp_tx(), the cloned skb should not have a reference to the ubuf to remove the circular dependency, but skb_clone() does not call skb_copy_ubufs() for zerocopy skb. So, we need to call skb_orphan_frags_rx() for the cloned skb to call skb_copy_ubufs(). [0]: BUG: memory leak unreferenced object 0xffff88800c6d2d00 (size 1152): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 cd af e8 81 00 00 00 00 ................ 02 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ backtrace: [<0000000055636812>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:2024 [<0000000054d77b7a>] sk_alloc+0x3b/0x800 net/core/sock.c:2083 [<0000000066f3c7e0>] inet_create net/ipv4/af_inet.c:319 [inline] [<0000000066f3c7e0>] inet_create+0x31e/0xe40 net/ipv4/af_inet.c:245 [<000000009b83af97>] __sock_create+0x2ab/0x550 net/socket.c:1515 [<00000000b9b11231>] sock_create net/socket.c:1566 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1603 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1588 [inline] [<00000000b9b11231>] __sys_socket+0x138/0x250 net/socket.c:1636 [<000000004fb45142>] __do_sys_socket net/socket.c:1649 [inline] [<000000004fb45142>] __se_sys_socket net/socket.c:1647 [inline] [<000000004fb45142>] __x64_sys_socket+0x73/0xb0 net/socket.c:1647 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd BUG: memory leak unreferenced object 0xffff888017633a00 (size 240): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 2d 6d 0c 80 88 ff ff .........-m..... backtrace: [<000000002b1c4368>] __alloc_skb+0x229/0x320 net/core/skbuff.c:497 [<00000000143579a6>] alloc_skb include/linux/skbuff.h:1265 [inline] [<00000000143579a6>] sock_omalloc+0xaa/0x190 net/core/sock.c:2596 [<00000000be626478>] msg_zerocopy_alloc net/core/skbuff.c:1294 [inline] [<00000000be626478>] msg_zerocopy_realloc+0x1ce/0x7f0 net/core/skbuff.c:1370 [<00000000cbfc9870>] __ip_append_data+0x2adf/0x3b30 net/ipv4/ip_output.c:1037 [<0000000089869146>] ip_make_skb+0x26c/0x2e0 net/ipv4/ip_output.c:1652 [<00000000098015c2>] udp_sendmsg+0x1bac/0x2390 net/ipv4/udp.c:1253 [<0000000045e0e95e>] inet_sendmsg+0x10a/0x150 net/ipv4/af_inet.c:819 [<000000008d31bfde>] sock_sendmsg_nosec net/socket.c:714 [inline] [<000000008d31bfde>] sock_sendmsg+0x141/0x190 net/socket.c:734 [<0000000021e21aa4>] __sys_sendto+0x243/0x360 net/socket.c:2117 [<00000000ac0af00c>] __do_sys_sendto net/socket.c:2129 [inline] [<00000000ac0af00c>] __se_sys_sendto net/socket.c:2125 [inline] [<00000000ac0af00c>] __x64_sys_sendto+0xe1/0x1c0 net/socket.c:2125 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY") Fixes: b5947e5d1e71 ("udp: msg_zerocopy") Reported-by: syzbot Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4c0879798eb8..2f9bb98170ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5162,6 +5162,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) + return; } if (!skb) return; -- cgit