diff options
Diffstat (limited to 'net/sched')
55 files changed, 2195 insertions, 1044 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 8180d0c12fce..ad914d2b2e22 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -403,6 +403,18 @@ config NET_SCH_ETS If unsure, say N. +config NET_SCH_BPF + bool "BPF-based Qdisc" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF + help + This option allows BPF-based queueing disiplines. With BPF struct_ops, + users can implement supported operators in Qdisc_ops using BPF programs. + The queue holding skb can be built with BPF maps or graphs. + + Say Y here if you want to use BPF-based Qdisc. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help @@ -784,7 +796,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET - select LIBCRC32C + select NET_CRC32C help Say Y here to update some common checksum after some direct packet alterations. diff --git a/net/sched/Makefile b/net/sched/Makefile index 82c3f78ca486..904d784902d1 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o +obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9ee622fb1160..057e20cef375 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -62,7 +62,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, { struct tc_cookie *old; - old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); + old = unrcu_pointer(xchg(old_cookie, RCU_INITIALIZER(new_cookie))); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } @@ -504,6 +504,50 @@ nla_put_failure: return -1; } +static int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + int err = -EINVAL; + u32 flags; + + if (tcf_action_dump_terse(skb, a, false)) + goto nla_put_failure; + + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && + nla_put_bitfield32(skb, TCA_ACT_HW_STATS, + a->hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; + + if (a->used_hw_stats_valid && + nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, + a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; + + flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; + if (flags && + nla_put_bitfield32(skb, TCA_ACT_FLAGS, + flags, flags)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) + goto nla_put_failure; + + nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_old(skb, a, bind, ref); + if (err > 0) { + nla_nest_end(skb, nest); + return err; + } + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { @@ -830,7 +874,6 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, u32 max; if (*index) { -again: rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); @@ -839,7 +882,7 @@ again: * index but did not assign the pointer yet. */ rcu_read_unlock(); - goto again; + return -EAGAIN; } if (!p) { @@ -1191,51 +1234,6 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -int -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) -{ - int err = -EINVAL; - unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nest; - u32 flags; - - if (tcf_action_dump_terse(skb, a, false)) - goto nla_put_failure; - - if (a->hw_stats != TCA_ACT_HW_STATS_ANY && - nla_put_bitfield32(skb, TCA_ACT_HW_STATS, - a->hw_stats, TCA_ACT_HW_STATS_ANY)) - goto nla_put_failure; - - if (a->used_hw_stats_valid && - nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, - a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) - goto nla_put_failure; - - flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; - if (flags && - nla_put_bitfield32(skb, TCA_ACT_FLAGS, - flags, flags)) - goto nla_put_failure; - - if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) - goto nla_put_failure; - - nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - err = tcf_action_dump_old(skb, a, bind, ref); - if (err > 0) { - nla_nest_end(skb, nest); - return err; - } - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} -EXPORT_SYMBOL(tcf_action_dump_1); - int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse) { @@ -1463,17 +1461,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; - struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 2]; struct tc_action *act; size_t sz = 0; int err; int i; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL, extack); if (err < 0) return err; + /* The nested attributes are parsed as types, but they are really an + * array of actions. So we parse one more than we can handle, and return + * an error if the last one is set (as that indicates that the request + * contained more than the maximum number of actions). + */ + if (tb[TCA_ACT_MAX_PRIO + 1]) { + NL_SET_ERR_MSG_FMT(extack, + "Only %d actions supported per filter", + TCA_ACT_MAX_PRIO); + return -EINVAL; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; @@ -1499,8 +1509,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, bool skip_sw = tc_skip_sw(fl_flags); bool skip_hw = tc_skip_hw(fl_flags); - if (tc_act_bind(act->tcfa_flags)) + if (tc_act_bind(act->tcfa_flags)) { + /* Action is created by classifier and is not + * standalone. Check that the user did not set + * any action flags different than the + * classifier flags, and inherit the flags from + * the classifier for the compatibility case + * where no flags were specified at all. + */ + if ((tc_act_skip_sw(act->tcfa_flags) && !skip_sw) || + (tc_act_skip_hw(act->tcfa_flags) && !skip_hw)) { + NL_SET_ERR_MSG(extack, + "Mismatch between action and filter offload flags"); + err = -EINVAL; + goto err; + } + if (skip_sw) + act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_SW; + if (skip_hw) + act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_HW; continue; + } + + /* Action is standalone */ if (skip_sw != tc_act_skip_sw(act->tcfa_flags) || skip_hw != tc_act_skip_hw(act->tcfa_flags)) { NL_SET_ERR_MSG(extack, @@ -2244,13 +2275,16 @@ out_module_put: return skb->len; } +static const struct rtnl_msg_handler tc_action_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWACTION, .doit = tc_ctl_action}, + {.msgtype = RTM_DELACTION, .doit = tc_ctl_action}, + {.msgtype = RTM_GETACTION, .doit = tc_ctl_action, + .dumpit = tc_dump_action}, +}; + static int __init tc_action_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, - 0); - + rtnl_register_many(tc_action_rtnl_msg_handlers); return 0; } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 0e3cf11ae5fc..396b576390d0 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -54,8 +54,8 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index baac083fd8f1..c02f39efc6ef 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -41,21 +41,26 @@ static struct workqueue_struct *act_ct_wq; static struct rhashtable zones_ht; static DEFINE_MUTEX(zones_mutex); +struct zones_ht_key { + struct net *net; + u16 zone; +}; + struct tcf_ct_flow_table { struct rhash_head node; /* In zones tables */ struct rcu_work rwork; struct nf_flowtable nf_ft; refcount_t ref; - u16 zone; + struct zones_ht_key key; bool dying; }; static const struct rhashtable_params zones_params = { .head_offset = offsetof(struct tcf_ct_flow_table, node), - .key_offset = offsetof(struct tcf_ct_flow_table, zone), - .key_len = sizeof_field(struct tcf_ct_flow_table, zone), + .key_offset = offsetof(struct tcf_ct_flow_table, key), + .key_len = offsetofend(struct zones_ht_key, zone), .automatic_shrinking = true, }; @@ -316,11 +321,12 @@ static struct nf_flowtable_type flowtable_ct = { static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) { + struct zones_ht_key key = { .net = net, .zone = params->zone }; struct tcf_ct_flow_table *ct_ft; int err = -ENOMEM; mutex_lock(&zones_mutex); - ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); + ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params); if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) goto out_unlock; @@ -329,7 +335,7 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) goto err_alloc; refcount_set(&ct_ft->ref, 1); - ct_ft->zone = params->zone; + ct_ft->key = key; err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); if (err) goto err_insert; @@ -938,6 +944,8 @@ static int tcf_ct_act_nat(struct sk_buff *skb, action |= BIT(NF_NAT_MANIP_DST); err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit); + if (err != NF_ACCEPT) + return err & NF_VERDICT_MASK; if (action & BIT(NF_NAT_MANIP_SRC)) tc_skb_cb(skb)->post_ct_snat = 1; @@ -1029,7 +1037,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, state.pf = family; err = nf_conntrack_in(skb, &state); if (err != NF_ACCEPT) - goto out_push; + goto nf_error; } do_nat: @@ -1041,7 +1049,7 @@ do_nat: err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit); if (err != NF_ACCEPT) - goto drop; + goto nf_error; if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) { err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC); @@ -1055,8 +1063,9 @@ do_nat: } if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) { - if (nf_ct_helper(skb, ct, ctinfo, family) != NF_ACCEPT) - goto drop; + err = nf_ct_helper(skb, ct, ctinfo, family); + if (err != NF_ACCEPT) + goto nf_error; } if (commit) { @@ -1069,8 +1078,17 @@ do_nat: /* This will take care of sending queued events * even if the connection is already confirmed. */ - if (nf_conntrack_confirm(skb) != NF_ACCEPT) - goto drop; + err = nf_conntrack_confirm(skb); + if (err != NF_ACCEPT) + goto nf_error; + + /* The ct may be dropped if a clash has been resolved, + * so it's necessary to retrieve it from skb again to + * prevent UAF. + */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + skip_add = true; } if (!skip_add) @@ -1094,6 +1112,21 @@ out_frag: drop: tcf_action_inc_drop_qstats(&c->common); return TC_ACT_SHOT; + +nf_error: + /* some verdicts store extra data in upper bits, such + * as errno or queue number. + */ + switch (err & NF_VERDICT_MASK) { + case NF_DROP: + goto drop; + case NF_STOLEN: + tcf_action_inc_drop_qstats(&c->common); + return TC_ACT_CONSUMED; + default: + DEBUG_NET_WARN_ON_ONCE(1); + goto drop; + } } static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { @@ -1150,9 +1183,8 @@ static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, range->min_addr.ip = nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]); - range->max_addr.ip = max_attr ? - nla_get_in_addr(max_attr) : - range->min_addr.ip; + range->max_addr.ip = + nla_get_in_addr_default(max_attr, range->min_addr.ip); } else if (tb[TCA_CT_NAT_IPV6_MIN]) { struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX]; @@ -1281,8 +1313,9 @@ static int tcf_ct_fill_params(struct net *net, err = -EINVAL; goto err; } - family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET; - proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP; + family = nla_get_u8_default(tb[TCA_CT_HELPER_FAMILY], AF_INET); + proto = nla_get_u8_default(tb[TCA_CT_HELPER_PROTO], + IPPROTO_TCP); err = nf_ct_add_helper(tmpl, name, family, proto, p->ct_action & TCA_CT_ACT_NAT, &p->helper); if (err) { diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 5dd41a012110..5b1241ddc758 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -197,8 +197,9 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, "dscp mask must be 6 contiguous bits"); return -EINVAL; } - dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ? - nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0; + dscpstatemask = + nla_get_u32_default(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], + 0); /* mask & statemask must not overlap */ if (dscpmask & dscpstatemask) { NL_SET_ERR_MSG_ATTR(extack, @@ -243,8 +244,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, } cp_new->net = net; - cp_new->zone = tb[TCA_CTINFO_ZONE] ? - nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0; + cp_new->zone = nla_get_u16_default(tb[TCA_CTINFO_ZONE], 0); if (dscpmask) { cp_new->dscpmask = dscpmask; cp_new->dscpmaskshift = dscpmaskshift; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 1dd74125398a..c1f75f272757 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -190,15 +190,10 @@ static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, entry->interval = interval; - if (tb[TCA_GATE_ENTRY_IPV]) - entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]); - else - entry->ipv = -1; + entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1); - if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) - entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); - else - entry->maxoctets = -1; + entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS], + -1); return 0; } @@ -292,8 +287,7 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; - hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); - gact->hitimer.function = gate_timer_func; + hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5b3814365924..5f01f567c934 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_nest_level); + +#ifndef CONFIG_PREEMPT_RT +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); +} + +static void tcf_mirred_nest_level_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); +} + +#else +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return current->net_xmit.sched_mirred_nest++; +} + +static void tcf_mirred_nest_level_dec(void) +{ + current->net_xmit.sched_mirred_nest--; +} +#endif static bool tcf_mirred_is_act_redirect(int action) { @@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int m_eaction; u32 blockid; - nest_level = __this_cpu_inc_return(mirred_nest_level); + nest_level = tcf_mirred_nest_level_inc_return(); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); @@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, retval); dec_nest_level: - __this_cpu_dec(mirred_nest_level); + tcf_mirred_nest_level_dec(); return retval; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 44a37a71ae92..9f86f4e666d3 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -288,16 +288,14 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, } p->tcfm_action = parm->m_action; - p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) : - ACT_MPLS_LABEL_NOT_SET; - p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) : - ACT_MPLS_TC_NOT_SET; - p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) : - mpls_ttl; - p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) : - ACT_MPLS_BOS_NOT_SET; - p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) : - htons(ETH_P_MPLS_UC); + p->tcfm_label = nla_get_u32_default(tb[TCA_MPLS_LABEL], + ACT_MPLS_LABEL_NOT_SET); + p->tcfm_tc = nla_get_u8_default(tb[TCA_MPLS_TC], ACT_MPLS_TC_NOT_SET); + p->tcfm_ttl = nla_get_u8_default(tb[TCA_MPLS_TTL], mpls_ttl); + p->tcfm_bos = nla_get_u8_default(tb[TCA_MPLS_BOS], + ACT_MPLS_BOS_NOT_SET); + p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO], + htons(ETH_P_MPLS_UC)); spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8555125ed34d..a214ed681142 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -167,8 +167,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (R_tab) { new->rate_present = true; - rate64 = tb[TCA_POLICE_RATE64] ? - nla_get_u64(tb[TCA_POLICE_RATE64]) : 0; + rate64 = nla_get_u64_default(tb[TCA_POLICE_RATE64], 0); psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64); qdisc_put_rtab(R_tab); } else { @@ -176,8 +175,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (P_tab) { new->peak_present = true; - prate64 = tb[TCA_POLICE_PEAKRATE64] ? - nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0; + prate64 = nla_get_u64_default(tb[TCA_POLICE_PEAKRATE64], 0); psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64); qdisc_put_rtab(P_tab); } else { diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index a69b53d54039..2ceb4d141b71 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -167,7 +167,9 @@ TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb, { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; + u8 cookie_data[TC_COOKIE_MAX_SIZE]; struct psample_metadata md = {}; + struct tc_cookie *user_cookie; int retval; tcf_lastuse_update(&s->tcf_tm); @@ -189,6 +191,16 @@ TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_push(skb, skb->mac_len); + rcu_read_lock(); + user_cookie = rcu_dereference(a->user_cookie); + if (user_cookie) { + memcpy(cookie_data, user_cookie->data, + user_cookie->len); + md.user_cookie = cookie_data; + md.user_cookie_len = user_cookie->len; + } + rcu_read_unlock(); + md.trunc_size = s->truncate ? s->trunc_size : skb->len; psample_sample_packet(psample_group, skb, s->rate, &md); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 39945b139c48..dc0229693461 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -241,13 +241,13 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); struct tcf_skbmod_params *p; - struct tc_skbmod opt = { - .index = d->tcf_index, - .refcnt = refcount_read(&d->tcf_refcnt) - ref, - .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - }; + struct tc_skbmod opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + opt.index = d->tcf_index; + opt.refcnt = refcount_read(&d->tcf_refcnt) - ref; + opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; p = rcu_dereference_protected(d->skbmod_p, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 1536f8b16f1b..2cef4b08befb 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -68,7 +68,7 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy @@ -230,7 +230,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) { + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } @@ -247,7 +247,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, dst_len -= opt_len; dst += opt_len; } - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: if (type) { @@ -259,7 +259,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: if (type) { @@ -271,7 +271,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; } } @@ -302,7 +302,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, switch (nla_type(nla_data(nla))) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -310,7 +310,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #endif case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -318,7 +318,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #endif case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -363,6 +363,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *metadata = NULL; struct tcf_chain *goto_ch = NULL; struct tc_tunnel_key *parm; @@ -371,7 +372,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, __be16 dst_port = 0; __be64 key_id = 0; int opts_len = 0; - __be16 flags = 0; u8 tos, ttl; int ret = 0; u32 index; @@ -412,16 +412,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); key_id = key32_to_tunnel_id(key32); - flags = TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, flags); } - flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, flags); if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) - flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, flags); if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG])) - flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, flags); if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); @@ -571,8 +571,8 @@ static void tunnel_key_release(struct tc_action *a) static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { + const u8 *src = ip_tunnel_info_opts(info); int len = info->options_len; - u8 *src = (u8 *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); @@ -580,7 +580,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, return -EMSGSIZE; while (len > 0) { - struct geneve_opt *opt = (struct geneve_opt *)src; + const struct geneve_opt *opt = (const struct geneve_opt *)src; if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class) || @@ -603,7 +603,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { - struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); + const struct vxlan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); @@ -622,7 +622,7 @@ static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { - struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); + const struct erspan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); @@ -663,15 +663,15 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, if (!start) return -EMSGSIZE; - if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; - } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_vxlan_opts_dump(skb, info); if (err) goto err_out; - } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_erspan_opts_dump(skb, info); if (err) goto err_out; @@ -741,7 +741,7 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); - if (((key->tun_flags & TUNNEL_KEY) && + if ((test_bit(IP_TUNNEL_KEY_BIT, key->tun_flags) && nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || @@ -749,8 +749,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, - !(key->tun_flags & TUNNEL_CSUM)) || - ((key->tun_flags & TUNNEL_DONT_FRAGMENT) && + !test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) || + (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) && nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) || tunnel_key_opts_dump(skb, info)) goto nla_put_failure; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 22f4b1e8ade9..383bf18b6862 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -96,6 +96,7 @@ out: if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); + skb_reset_mac_len(skb); return action; drop: diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c new file mode 100644 index 000000000000..7ea8b54b2ab1 --- /dev/null +++ b/net/sched/bpf_qdisc.c @@ -0,0 +1,475 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/types.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + +#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void))) +#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void))) + +static struct bpf_struct_ops bpf_Qdisc_ops; + +struct bpf_sched_data { + struct qdisc_watchdog watchdog; +}; + +struct bpf_sk_buff_ptr { + struct sk_buff *skb; +}; + +static int bpf_qdisc_init(struct btf *btf) +{ + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr) + +static bool bpf_qdisc_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct btf *btf = prog->aux->attach_btf; + u32 arg; + + arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off); + if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) { + if (arg == 2 && type == BPF_READ) { + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + info->btf = btf; + info->btf_id = bpf_sk_buff_ptr_ids[0]; + return true; + } + } + + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct Qdisc, limit): + *end = offsetofend(struct Qdisc, limit); + break; + case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen): + *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen); + break; + case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1: + *end = offsetofend(struct Qdisc, qstats); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct sk_buff, tstamp): + *end = offsetofend(struct sk_buff, tstamp); + break; + case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ... + offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, + data[QDISC_CB_PRIV_LEN - 1]): + *end = offsetof(struct sk_buff, cb) + + offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + const struct btf_type *t, *skbt, *qdisct; + size_t end; + int err; + + skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]); + qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == skbt) { + err = bpf_qdisc_sk_buff_access(log, reg, off, &end); + } else if (t == qdisct) { + err = bpf_qdisc_qdisc_access(log, reg, off, &end); + } else { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + if (err) { + bpf_log(log, "no write support to %s at off %d\n", + btf_name_by_offset(reg->btf, t->name_off), off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of %s ended at %zu\n", + off, size, btf_name_by_offset(reg->btf, t->name_off), end); + return -EACCES; + } + + return 0; +} + +BTF_ID_LIST(bpf_qdisc_init_prologue_ids) +BTF_ID(func, bpf_qdisc_init_prologue) + +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init)) + return 0; + + /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx". + * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_init_prologue(r1, r2); + * if r0 == 0 goto pc+1; + * BPF_EXIT; + * r1 = r6; // r1 will be "u64 *ctx". + */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1); + *insn++ = BPF_EXIT_INSN(); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids) +BTF_ID(func, bpf_qdisc_reset_destroy_epilogue) + +static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) && + prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy)) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_reset_destroy_epilogue(r1); + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +__bpf_kfunc_start_defs(); + +/* bpf_skb_get_hash - Get the flow hash of an skb. + * @skb: The skb to get the flow hash from. + */ +__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +/* bpf_kfree_skb - Release an skb's reference and drop it immediately. + * @skb: The skb whose reference to be released and dropped. + */ +__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. + * @skb: The skb whose reference to be released and dropped. + * @to_free_list: The list of skbs to be dropped. + */ +__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb, + struct bpf_sk_buff_ptr *to_free_list) +{ + __qdisc_drop(skb, (struct sk_buff **)to_free_list); +} + +/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer. + * @sch: The qdisc to be scheduled. + * @expire: The expiry time of the timer. + * @delta_ns: The slack range of the timer. + */ +__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns); +} + +/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */ +__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch, + struct netlink_ext_ack *extack) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *p; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (sch->parent != TC_H_ROOT) { + /* If qdisc_lookup() returns NULL, it means .init is called by + * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc + * has not been added to qdisc_hash yet. + */ + p = qdisc_lookup(dev, TC_H_MAJ(sch->parent)); + if (p && !(p->flags & TCQ_F_MQROOT)) { + NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq"); + return -EINVAL; + } + } + + return 0; +} + +/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset + * and .destroy + */ +__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + +/* bpf_qdisc_bstats_update - Update Qdisc basic statistics + * @sch: The qdisc from which an skb is dequeued. + * @skb: The skb to be dequeued. + */ +__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) +{ + bstats_update(&sch->bstats, skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(qdisc_kfunc_ids) +BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(qdisc_kfunc_ids) + +BTF_SET_START(qdisc_common_kfunc_set) +BTF_ID(func, bpf_skb_get_hash) +BTF_ID(func, bpf_kfree_skb) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_SET_END(qdisc_common_kfunc_set) + +BTF_SET_START(qdisc_enqueue_kfunc_set) +BTF_ID(func, bpf_qdisc_skb_drop) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_SET_END(qdisc_enqueue_kfunc_set) + +BTF_SET_START(qdisc_dequeue_kfunc_set) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_ID(func, bpf_qdisc_bstats_update) +BTF_SET_END(qdisc_dequeue_kfunc_set) + +enum qdisc_ops_kf_flags { + QDISC_OPS_KF_COMMON = 0, + QDISC_OPS_KF_ENQUEUE = 1 << 0, + QDISC_OPS_KF_DEQUEUE = 1 << 1, +}; + +static const u32 qdisc_ops_context_flags[] = { + [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE, + [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE, + [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON, +}; + +static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + u32 moff, flags; + + if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id)) + return 0; + + if (prog->aux->st_ops != &bpf_Qdisc_ops) + return -EACCES; + + moff = prog->aux->attach_st_ops_member_off; + flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)]; + + if ((flags & QDISC_OPS_KF_ENQUEUE) && + btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id)) + return 0; + + if ((flags & QDISC_OPS_KF_DEQUEUE) && + btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id)) + return 0; + + if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id)) + return 0; + + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = { + .owner = THIS_MODULE, + .set = &qdisc_kfunc_ids, + .filter = bpf_qdisc_kfunc_filter, +}; + +static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_qdisc_is_valid_access, + .btf_struct_access = bpf_qdisc_btf_struct_access, + .gen_prologue = bpf_qdisc_gen_prologue, + .gen_epilogue = bpf_qdisc_gen_epilogue, +}; + +static int bpf_qdisc_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct Qdisc_ops *uqdisc_ops; + struct Qdisc_ops *qdisc_ops; + u32 moff; + + uqdisc_ops = (const struct Qdisc_ops *)udata; + qdisc_ops = (struct Qdisc_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct Qdisc_ops, priv_size): + if (uqdisc_ops->priv_size) + return -EINVAL; + qdisc_ops->priv_size = sizeof(struct bpf_sched_data); + return 1; + case offsetof(struct Qdisc_ops, peek): + qdisc_ops->peek = qdisc_peek_dequeued; + return 0; + case offsetof(struct Qdisc_ops, id): + if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id, + sizeof(qdisc_ops->id)) <= 0) + return -EINVAL; + return 1; + } + + return 0; +} + +static int bpf_qdisc_reg(void *kdata, struct bpf_link *link) +{ + return register_qdisc(kdata); +} + +static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link) +{ + return unregister_qdisc(kdata); +} + +static int bpf_qdisc_validate(void *kdata) +{ + struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata; + + if (!ops->enqueue || !ops->dequeue || !ops->init || + !ops->reset || !ops->destroy) + return -EINVAL; + + return 0; +} + +static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch, + struct sk_buff **to_free) +{ + return 0; +} + +static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void Qdisc_ops__reset(struct Qdisc *sch) +{ +} + +static void Qdisc_ops__destroy(struct Qdisc *sch) +{ +} + +static struct Qdisc_ops __bpf_ops_qdisc_ops = { + .enqueue = Qdisc_ops__enqueue, + .dequeue = Qdisc_ops__dequeue, + .init = Qdisc_ops__init, + .reset = Qdisc_ops__reset, + .destroy = Qdisc_ops__destroy, +}; + +static struct bpf_struct_ops bpf_Qdisc_ops = { + .verifier_ops = &bpf_qdisc_verifier_ops, + .reg = bpf_qdisc_reg, + .unreg = bpf_qdisc_unreg, + .validate = bpf_qdisc_validate, + .init_member = bpf_qdisc_init_member, + .init = bpf_qdisc_init, + .name = "Qdisc_ops", + .cfi_stubs = &__bpf_ops_qdisc_ops, + .owner = THIS_MODULE, +}; + +BTF_ID_LIST(bpf_sk_buff_dtor_ids) +BTF_ID(func, bpf_kfree_skb) + +static int __init bpf_qdisc_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = { + { + .btf_id = bpf_sk_buff_ids[0], + .kfunc_btf_id = bpf_sk_buff_dtor_ids[0] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors, + ARRAY_SIZE(skb_kfunc_dtors), + THIS_MODULE); + ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); + + return ret; +} +late_initcall(bpf_qdisc_kfunc_init); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ca5676b2668e..ecec0a1e1c1a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -97,7 +97,7 @@ tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); - if (err) + if (err < 0) goto err_xa_alloc; exts->miss_cookie_node = n; @@ -390,6 +390,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->protocol = protocol; tp->prio = prio; tp->chain = chain; + tp->usesw = !tp->ops->reoffload; spin_lock_init(&tp->lock); refcount_set(&tp->refcnt, 1); @@ -410,12 +411,40 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } +static void tcf_proto_count_usesw(struct tcf_proto *tp, bool add) +{ +#ifdef CONFIG_NET_CLS_ACT + struct tcf_block *block = tp->chain->block; + bool counted = false; + + if (!add) { + if (tp->usesw && tp->counted) { + if (!atomic_dec_return(&block->useswcnt)) + static_branch_dec(&tcf_sw_enabled_key); + tp->counted = false; + } + return; + } + + spin_lock(&tp->lock); + if (tp->usesw && !tp->counted) { + counted = true; + tp->counted = true; + } + spin_unlock(&tp->lock); + + if (counted && atomic_inc_return(&block->useswcnt) == 1) + static_branch_inc(&tcf_sw_enabled_key); +#endif +} + static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); + tcf_proto_count_usesw(tp, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); @@ -1482,6 +1511,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, return 0; err_dev_insert: + tcf_block_offload_unbind(block, q, ei); err_block_offload_bind: tcf_chain0_head_change_cb_del(block, ei); err_chain0_head_change_cb_add: @@ -1896,7 +1926,8 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, - bool prio_allocate); + bool prio_allocate, + struct netlink_ext_ack *extack); /* Try to insert new proto. * If proto with specified priority already exists, free new proto @@ -1920,8 +1951,7 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, return ERR_PTR(-EAGAIN); } - tp = tcf_chain_tp_find(chain, &chain_info, - protocol, prio, false); + tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, NULL); if (!tp) err = tcf_chain_tp_insert(chain, &chain_info, tp_new); mutex_unlock(&chain->filter_chain_lock); @@ -1981,7 +2011,8 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, - bool prio_allocate) + bool prio_allocate, + struct netlink_ext_ack *extack) { struct tcf_proto **pprev; struct tcf_proto *tp; @@ -1992,9 +2023,14 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, pprev = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { - if (prio_allocate || - (tp->protocol != protocol && protocol)) + if (prio_allocate) { + NL_SET_ERR_MSG(extack, "Lowest ID from auto-alloc range already in use"); + return ERR_PTR(-ENOSPC); + } + if (tp->protocol != protocol && protocol) { + NL_SET_ERR_MSG(extack, "Protocol mismatch for filter with specified priority"); return ERR_PTR(-EINVAL); + } } else { tp = NULL; } @@ -2021,6 +2057,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); + int ret = -EMSGSIZE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -2065,11 +2102,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, return skb->len; +cls_op_not_supp: + ret = -EOPNOTSUPP; out_nlmsg_trim: nla_put_failure: -cls_op_not_supp: nlmsg_trim(skb, b); - return -1; + return ret; +} + +static struct sk_buff *tfilter_notify_prep(struct net *net, + struct sk_buff *oskb, + struct nlmsghdr *n, + struct tcf_proto *tp, + struct tcf_block *block, + struct Qdisc *q, u32 parent, + void *fh, int event, + u32 portid, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE; + struct sk_buff *skb; + int ret; + +retry: + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOBUFS); + + ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, event, false, + rtnl_held, extack); + if (ret <= 0) { + kfree_skb(skb); + if (ret == -EMSGSIZE) { + size += NLMSG_GOODSIZE; + goto retry; + } + return ERR_PTR(-EINVAL); + } + return skb; } static int tfilter_notify(struct net *net, struct sk_buff *oskb, @@ -2085,16 +2156,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held, extack) <= 0) { - kfree_skb(skb); - return -EINVAL; - } + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event, + portid, rtnl_held, extack); + if (IS_ERR(skb)) + return PTR_ERR(skb); if (unicast) err = rtnl_unicast(skb, net, portid); @@ -2117,16 +2182,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return tp->ops->delete(tp, fh, last, rtnl_held, extack); - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held, extack) <= 0) { + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, + RTM_DELTFILTER, portid, rtnl_held, extack); + if (IS_ERR(skb)) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); - kfree_skb(skb); - return -EINVAL; + return PTR_ERR(skb); } err = tp->ops->delete(tp, fh, last, rtnl_held, extack); @@ -2260,7 +2320,7 @@ replay: } block->classid = parent; - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; @@ -2275,9 +2335,8 @@ replay: mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, prio_allocate); + prio, prio_allocate, extack); if (IS_ERR(tp)) { - NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = PTR_ERR(tp); goto errout_locked; } @@ -2367,6 +2426,7 @@ replay: tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); + tcf_proto_count_usesw(tp, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; @@ -2471,7 +2531,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, goto errout; } - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; @@ -2501,10 +2561,13 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, false); - if (!tp || IS_ERR(tp)) { + prio, false, extack); + if (!tp) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); - err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout_locked; + } else if (IS_ERR(tp)) { + err = PTR_ERR(tp); goto errout_locked; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); @@ -2626,7 +2689,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, goto errout; } - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; @@ -2641,11 +2704,14 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, false); + prio, false, extack); mutex_unlock(&chain->filter_chain_lock); - if (!tp || IS_ERR(tp)) { + if (!tp) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); - err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout; + } else if (IS_ERR(tp)) { + err = PTR_ERR(tp); goto errout; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); @@ -3066,7 +3132,7 @@ replay: if (IS_ERR(block)) return PTR_ERR(block); - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; @@ -4014,6 +4080,19 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; +static const struct rtnl_msg_handler tc_filter_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWTFILTER, .doit = tc_new_tfilter, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_DELTFILTER, .doit = tc_del_tfilter, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_GETTFILTER, .doit = tc_get_tfilter, + .dumpit = tc_dump_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_NEWCHAIN, .doit = tc_ctl_chain}, + {.msgtype = RTM_DELCHAIN, .doit = tc_ctl_chain}, + {.msgtype = RTM_GETCHAIN, .doit = tc_ctl_chain, + .dumpit = tc_dump_chain}, +}; + static int __init tc_filter_init(void) { int err; @@ -4027,17 +4106,7 @@ static int __init tc_filter_init(void) goto err_register_pernet_subsys; xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1); - - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, - tc_dump_tfilter, RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain, - tc_dump_chain, 0); + rtnl_register_many(tc_filter_rtnl_msg_handlers); return 0; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5e83e890f6a4..7fbe42f0e5c2 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -104,8 +104,8 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (prog->exts_integrated) { res->class = 0; @@ -509,6 +509,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, prog->gen_flags); + if (oldprog) { idr_replace(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5502998aace7..5693b41b093f 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -345,7 +345,7 @@ TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb, static void flow_perturbation(struct timer_list *t) { - struct flow_filter *f = from_timer(f, t, perturb_timer); + struct flow_filter *f = timer_container_of(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) @@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, - [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, + 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e1314674b4a9..099ff6a3e1f5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,6 +28,7 @@ #include <net/vxlan.h> #include <net/erspan.h> #include <net/gtp.h> +#include <net/pfcp.h> #include <net/tc_wrapper.h> #include <net/dst.h> @@ -40,6 +41,16 @@ #define TCA_FLOWER_KEY_CT_FLAGS_MASK \ (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) +#define TCA_FLOWER_KEY_FLAGS_POLICY_MASK \ + (TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT | \ + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST) + +#define TCA_FLOWER_KEY_ENC_FLAGS_POLICY_MASK \ + (TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM | \ + TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT | \ + TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM | \ + TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT) + struct fl_flow_key { struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; @@ -668,8 +679,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 }, - [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 }, - [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_FLAGS] = NLA_POLICY_MASK(NLA_BE32, + TCA_FLOWER_KEY_FLAGS_POLICY_MASK), + [TCA_FLOWER_KEY_FLAGS_MASK] = NLA_POLICY_MASK(NLA_BE32, + TCA_FLOWER_KEY_FLAGS_POLICY_MASK), [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 }, @@ -731,6 +744,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_SPI_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_FLAGS] = NLA_POLICY_MASK(NLA_BE32, + TCA_FLOWER_KEY_ENC_FLAGS_POLICY_MASK), + [TCA_FLOWER_KEY_ENC_FLAGS_MASK] = NLA_POLICY_MASK(NLA_BE32, + TCA_FLOWER_KEY_ENC_FLAGS_POLICY_MASK), }; static const struct nla_policy @@ -741,6 +758,7 @@ enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_GTP] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_PFCP] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -748,7 +766,7 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy @@ -771,6 +789,12 @@ gtp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1] = { }; static const struct nla_policy +pfcp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID] = { .type = NLA_U64 }, +}; + +static const struct nla_policy mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, @@ -1147,19 +1171,29 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask, } } -static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key, - u32 *flags_mask, struct netlink_ext_ack *extack) +static int fl_set_key_flags(struct nlattr *tca_opts, struct nlattr **tb, + bool encap, u32 *flags_key, u32 *flags_mask, + struct netlink_ext_ack *extack) { + int fl_key, fl_mask; u32 key, mask; + if (encap) { + fl_key = TCA_FLOWER_KEY_ENC_FLAGS; + fl_mask = TCA_FLOWER_KEY_ENC_FLAGS_MASK; + } else { + fl_key = TCA_FLOWER_KEY_FLAGS; + fl_mask = TCA_FLOWER_KEY_FLAGS_MASK; + } + /* mask is mandatory for flags */ - if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) { + if (NL_REQ_ATTR_CHECK(extack, tca_opts, tb, fl_mask)) { NL_SET_ERR_MSG(extack, "Missing flags mask"); return -EINVAL; } - key = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS])); - mask = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); + key = be32_to_cpu(nla_get_be32(tb[fl_key])); + mask = be32_to_cpu(nla_get_be32(tb[fl_mask])); *flags_key = 0; *flags_mask = 0; @@ -1170,6 +1204,21 @@ static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key, TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, FLOW_DIS_FIRST_FRAG); + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM, + FLOW_DIS_F_TUNNEL_CSUM); + + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT, + FLOW_DIS_F_TUNNEL_DONT_FRAGMENT); + + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, FLOW_DIS_F_TUNNEL_OAM); + + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT, + FLOW_DIS_F_TUNNEL_CRIT_OPT); + return 0; } @@ -1320,7 +1369,6 @@ static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, int err; md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len]; - memset(md, 0xff, sizeof(*md)); md->version = 1; if (!depth) @@ -1349,9 +1397,9 @@ static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); return -EINVAL; } + memset(&md->u.index, 0xff, sizeof(md->u.index)); if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) { nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]; - memset(&md->u, 0x00, sizeof(md->u)); md->u.index = nla_get_be32(nla); } } else if (md->version == 2) { @@ -1360,10 +1408,12 @@ static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); return -EINVAL; } + md->u.md2.dir = 1; if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) { nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(nla); } + set_hwid(&md->u.md2, 0xff); if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) { nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(nla)); @@ -1419,6 +1469,44 @@ static int fl_set_gtp_opt(const struct nlattr *nla, struct fl_flow_key *key, return sizeof(*sinfo); } +static int fl_set_pfcp_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1]; + struct pfcp_metadata *md; + int err; + + md = (struct pfcp_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_PFCP) { + NL_SET_ERR_MSG_MOD(extack, "Non-pfcp option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX, nla, + pfcp_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing tunnel key pfcp option type"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) + md->type = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]); + + if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]) + md->seid = nla_get_be64(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]); + + return sizeof(*md); +} + static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1454,12 +1542,13 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, switch (nla_type(nla_opt_key)) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: if (key->enc_opts.dst_opt_type && - key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) { + key->enc_opts.dst_opt_type != + IP_TUNNEL_GENEVE_OPT_BIT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT; option_len = fl_set_geneve_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1470,7 +1559,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT; option_len = fl_set_geneve_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1489,7 +1578,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT; option_len = fl_set_vxlan_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1500,7 +1589,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT; option_len = fl_set_vxlan_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1519,7 +1608,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT; option_len = fl_set_erspan_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1530,7 +1619,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT; option_len = fl_set_erspan_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1550,7 +1639,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT; option_len = fl_set_gtp_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1561,7 +1650,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT; option_len = fl_set_gtp_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1575,6 +1664,36 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } break; + case TCA_FLOWER_KEY_ENC_OPTS_PFCP: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG_MOD(extack, "Duplicate type for pfcp options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT; + option_len = fl_set_pfcp_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT; + option_len = fl_set_pfcp_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG_MOD(extack, "Key and mask miss aligned"); + return -EINVAL; + } + break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; @@ -1748,9 +1867,9 @@ static int fl_set_key_cfm(struct nlattr **tb, return 0; } -static int fl_set_key(struct net *net, struct nlattr **tb, - struct fl_flow_key *key, struct fl_flow_key *mask, - struct netlink_ext_ack *extack) +static int fl_set_key(struct net *net, struct nlattr *tca_opts, + struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask, struct netlink_ext_ack *extack) { __be16 ethertype; int ret = 0; @@ -1982,9 +2101,18 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (ret) return ret; - if (tb[TCA_FLOWER_KEY_FLAGS]) - ret = fl_set_key_flags(tb, &key->control.flags, + if (tb[TCA_FLOWER_KEY_FLAGS]) { + ret = fl_set_key_flags(tca_opts, tb, false, + &key->control.flags, &mask->control.flags, extack); + if (ret) + return ret; + } + + if (tb[TCA_FLOWER_KEY_ENC_FLAGS]) + ret = fl_set_key_flags(tca_opts, tb, true, + &key->enc_control.flags, + &mask->enc_control.flags, extack); return ret; } @@ -2075,7 +2203,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6); if (FL_KEY_IS_MASKED(mask, enc_ipv4) || - FL_KEY_IS_MASKED(mask, enc_ipv6)) + FL_KEY_IS_MASKED(mask, enc_ipv6) || + FL_KEY_IS_MASKED(mask, enc_control)) FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); FL_KEY_SET_IF_MASKED(mask, keys, cnt, @@ -2233,6 +2362,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, { struct cls_fl_head *head = fl_head_dereference(tp); bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); + struct nlattr *tca_opts = tca[TCA_OPTIONS]; struct cls_fl_filter *fold = *arg; bool bound_to_filter = false; struct cls_fl_filter *fnew; @@ -2241,7 +2371,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, bool in_ht; int err; - if (!tca[TCA_OPTIONS]) { + if (!tca_opts) { err = -EINVAL; goto errout_fold; } @@ -2259,7 +2389,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX, - tca[TCA_OPTIONS], fl_policy, NULL); + tca_opts, fl_policy, NULL); if (err < 0) goto errout_tb; @@ -2335,7 +2465,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, bound_to_filter = true; } - err = fl_set_key(net, tb, &fnew->key, &mask->key, extack); + err = fl_set_key(net, tca_opts, tb, &fnew->key, &mask->key, extack); if (err) goto unbind_filter; @@ -2373,6 +2503,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(fnew->flags)) fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, fnew->flags); + spin_lock(&tp->lock); /* tp was deleted concurrently. -EAGAIN will cause caller to lookup @@ -2675,18 +2807,19 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack) { + struct nlattr *tca_opts = tca[TCA_OPTIONS]; struct fl_flow_tmplt *tmplt; struct nlattr **tb; int err; - if (!tca[TCA_OPTIONS]) + if (!tca_opts) return ERR_PTR(-EINVAL); tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!tb) return ERR_PTR(-ENOBUFS); err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX, - tca[TCA_OPTIONS], fl_policy, NULL); + tca_opts, fl_policy, NULL); if (err) goto errout_tb; @@ -2696,7 +2829,8 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, goto errout_tb; } tmplt->chain = chain; - err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack); + err = fl_set_key(net, tca_opts, tb, &tmplt->dummy_key, + &tmplt->mask, extack); if (err) goto errout_tmplt; @@ -2972,12 +3106,22 @@ static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask, } } -static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) +static int fl_dump_key_flags(struct sk_buff *skb, bool encap, + u32 flags_key, u32 flags_mask) { - u32 key, mask; + int fl_key, fl_mask; __be32 _key, _mask; + u32 key, mask; int err; + if (encap) { + fl_key = TCA_FLOWER_KEY_ENC_FLAGS; + fl_mask = TCA_FLOWER_KEY_ENC_FLAGS_MASK; + } else { + fl_key = TCA_FLOWER_KEY_FLAGS; + fl_mask = TCA_FLOWER_KEY_FLAGS_MASK; + } + if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask))) return 0; @@ -2990,14 +3134,29 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, FLOW_DIS_FIRST_FRAG); + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM, + FLOW_DIS_F_TUNNEL_CSUM); + + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT, + FLOW_DIS_F_TUNNEL_DONT_FRAGMENT); + + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, FLOW_DIS_F_TUNNEL_OAM); + + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT, + FLOW_DIS_F_TUNNEL_CRIT_OPT); + _key = cpu_to_be32(key); _mask = cpu_to_be32(mask); - err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key); + err = nla_put(skb, fl_key, 4, &_key); if (err) return err; - return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); + return nla_put(skb, fl_mask, 4, &_mask); } static int fl_dump_key_geneve_opt(struct sk_buff *skb, @@ -3117,6 +3276,32 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_pfcp_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct pfcp_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_PFCP); + if (!nest) + goto nla_put_failure; + + md = (struct pfcp_metadata *)&enc_opts->data[0]; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE, md->type)) + goto nla_put_failure; + + if (nla_put_be64(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID, + md->seid, 0)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fl_dump_key_ct(struct sk_buff *skb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask) @@ -3202,26 +3387,31 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, goto nla_put_failure; switch (enc_opts->dst_opt_type) { - case TUNNEL_GENEVE_OPT: + case IP_TUNNEL_GENEVE_OPT_BIT: err = fl_dump_key_geneve_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_VXLAN_OPT: + case IP_TUNNEL_VXLAN_OPT_BIT: err = fl_dump_key_vxlan_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_ERSPAN_OPT: + case IP_TUNNEL_ERSPAN_OPT_BIT: err = fl_dump_key_erspan_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_GTP_OPT: + case IP_TUNNEL_GTP_OPT_BIT: err = fl_dump_key_gtp_opt(skb, enc_opts); if (err) goto nla_put_failure; break; + case IP_TUNNEL_PFCP_OPT_BIT: + err = fl_dump_key_pfcp_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; default: goto nla_put_failure; } @@ -3473,7 +3663,8 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_ct(skb, &key->ct, &mask->ct)) goto nla_put_failure; - if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) + if (fl_dump_key_flags(skb, false, key->control.flags, + mask->control.flags)) goto nla_put_failure; if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH, @@ -3484,6 +3675,10 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_cfm(skb, &key->cfm, &mask->cfm)) goto nla_put_failure; + if (fl_dump_key_flags(skb, true, key->enc_control.flags, + mask->enc_control.flags)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 9f1e62ca508d..f03bf5da39ee 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -228,6 +228,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + *arg = head; rcu_assign_pointer(tp->root, new); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 9412d88a99bc..2a1c00048fd6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -92,6 +92,16 @@ struct tc_u_common { long knodes; }; +static u32 handle2id(u32 h) +{ + return ((h & 0x80000000) ? ((h >> 20) & 0x7FF) : h); +} + +static u32 id2handle(u32 id) +{ + return (id | 0x800U) << 20; +} + static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) @@ -310,7 +320,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL); if (id < 0) return 0; - return (id | 0x800U) << 20; + return id2handle(id); } static struct hlist_head *tc_u_common_hash; @@ -360,7 +370,7 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; refcount_set(&root_ht->refcnt, 1); - root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; + root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : id2handle(0); root_ht->prio = tp->prio; root_ht->is_root = true; idr_init(&root_ht->handle_idr); @@ -612,7 +622,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, if (phn == ht) { u32_clear_hw_hnode(tp, ht, extack); idr_destroy(&ht->handle_idr); - idr_remove(&tp_c->handle_idr, ht->handle); + idr_remove(&tp_c->handle_idr, handle2id(ht->handle)); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -941,6 +951,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); @@ -989,7 +1001,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { - idr_remove(&tp_c->handle_idr, handle); + idr_remove(&tp_c->handle_idr, handle2id(handle)); kfree(ht); return err; } @@ -1154,6 +1166,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(n->flags)) n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, n->flags); + ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index c90ad7ea26b4..64b637f18bc7 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -10,7 +10,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tc_ematch/tc_em_cmp.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <net/pkt_cls.h> static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 8996c73c9779..3f2e707a11d1 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -460,7 +460,7 @@ META_COLLECTOR(int_sk_fwd_alloc) *err = -1; return; } - dst->value = sk_forward_alloc_get(sk); + dst->value = READ_ONCE(sk->sk_forward_alloc); } META_COLLECTOR(int_sk_sndbuf) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 65e05b0c98e4..d7c767b861a4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -25,7 +25,9 @@ #include <linux/hrtimer.h> #include <linux/slab.h> #include <linux/hashtable.h> +#include <linux/bpf.h> +#include <net/netdev_lock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -206,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name) for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -236,7 +238,7 @@ int qdisc_set_default(const char *name) if (ops) { /* Set new default */ - module_put(default_qdisc_ops->owner); + bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); default_qdisc_ops = ops; } write_unlock(&qdisc_mod_lock); @@ -334,17 +336,22 @@ out: return q; } -static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid, + struct netlink_ext_ack *extack) { unsigned long cl; const struct Qdisc_class_ops *cops = p->ops->cl_ops; - if (cops == NULL) - return NULL; + if (cops == NULL) { + NL_SET_ERR_MSG(extack, "Parent qdisc is not classful"); + return ERR_PTR(-EOPNOTSUPP); + } cl = cops->find(p, classid); - if (cl == 0) - return NULL; + if (cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class not found"); + return ERR_PTR(-ENOENT); + } return cops->leaf(p, cl); } @@ -358,7 +365,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -593,17 +600,6 @@ out: pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } -EXPORT_SYMBOL(__qdisc_calculate_pkt_len); - -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) -{ - if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { - pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", - txt, qdisc->ops->id, qdisc->handle >> 16); - qdisc->flags |= TCQ_F_WARN_NONWC; - } -} -EXPORT_SYMBOL(qdisc_warn_nonwc); static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { @@ -620,8 +616,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid) { - hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); - wd->timer.function = qdisc_watchdog; + hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED); wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init_clockid); @@ -780,42 +775,33 @@ static u32 qdisc_alloc_handle(struct net_device *dev) void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { - bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; bool notify; int drops; - if (n == 0 && len == 0) - return; drops = max_t(int, n, 0); rcu_read_lock(); while ((parentid = sch->parent)) { - if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) + if (parentid == TC_H_ROOT) break; if (sch->flags & TCQ_F_NOPARENT) break; - /* Notify parent qdisc only if child qdisc becomes empty. - * - * If child was empty even before update then backlog - * counter is screwed and we skip notification because - * parent class is already passive. - * - * If the original child was offloaded then it is allowed - * to be seem as empty, so the parent is notified anyway. - */ - notify = !sch->q.qlen && !WARN_ON_ONCE(!n && - !qdisc_is_offloaded); + /* Notify parent qdisc only if child qdisc becomes empty. */ + notify = !sch->q.qlen; /* TODO: perform the search on a per txq basis */ - sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { WARN_ON_ONCE(parentid != TC_H_ROOT); break; } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { + /* Note that qlen_notify must be idempotent as it may get called + * multiple times. + */ cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); } @@ -1201,6 +1187,12 @@ skip: return -EINVAL; } + if (new && + !(parent->flags & TCQ_F_MQROOT) && + rcu_access_pointer(new->stab)) { + NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); + return -EINVAL; + } err = cops->graft(parent, cl, new, &old, extack); if (err) return err; @@ -1262,36 +1254,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev, struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); -#ifdef CONFIG_MODULES - if (ops == NULL && kind != NULL) { - char name[IFNAMSIZ]; - if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { - /* We dropped the RTNL semaphore in order to - * perform the module load. So, even if we - * succeeded in loading the module we have to - * tell the caller to replay the request. We - * indicate this using -EAGAIN. - * We replay the request because the device may - * go away in the mean time. - */ - rtnl_unlock(); - request_module(NET_SCH_ALIAS_PREFIX "%s", name); - rtnl_lock(); - ops = qdisc_lookup_ops(kind); - if (ops != NULL) { - /* We will try again qdisc_lookup_ops, - * so don't keep a reference. - */ - module_put(ops->owner); - err = -EAGAIN; - goto err_out; - } - } - } -#endif - - err = -ENOENT; if (!ops) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); goto err_out; } @@ -1334,7 +1298,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, * before again attaching a qdisc. */ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { - dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN); netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } @@ -1389,10 +1353,11 @@ err_out4: ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: + lockdep_unregister_key(&sch->root_lock_key); netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: - module_put(ops->owner); + bpf_module_put(ops, ops->owner); err_out: *errp = err; return NULL; @@ -1499,27 +1464,18 @@ const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { * Delete/get qdisc. */ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = nlmsg_data(n); - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; + u32 clid; int err; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { @@ -1529,7 +1485,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); } else if (dev_ingress_queue(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } @@ -1540,6 +1496,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; } + if (IS_ERR(q)) + return PTR_ERR(q); if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Invalid handle"); @@ -1554,7 +1512,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } @@ -1576,6 +1534,31 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, return 0; } +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + static bool req_create_or_replace(struct nlmsghdr *n) { return (n->nlmsg_flags & NLM_F_CREATE && @@ -1595,35 +1578,18 @@ static bool req_change(struct nlmsghdr *n) !(n->nlmsg_flags & NLM_F_EXCL)); } -/* - * Create/change qdisc. - */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { - struct net *net = sock_net(skb->sk); - struct tcmsg *tcm; - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; u32 clid; - struct Qdisc *q, *p; int err; -replay: - /* Reinit, just in case something touches this. */ - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - tcm = nlmsg_data(n); clid = tcm->tcm_parent; - q = p = NULL; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - if (clid) { if (clid != TC_H_ROOT) { @@ -1633,7 +1599,9 @@ replay: NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); + if (IS_ERR(q)) + return PTR_ERR(q); } else if (dev_ingress_queue_create(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } @@ -1658,13 +1626,17 @@ replay: q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; + if (q->parent != tcm->tcm_parent) { + NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent"); + return -EINVAL; + } if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } if (q->flags & TCQ_F_INGRESS) { @@ -1740,12 +1712,12 @@ replay: return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } err = qdisc_change(q, tca, extack); if (err == 0) - qdisc_notify(net, skb, n, clid, NULL, q, extack); + qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack); return err; create_n_graft: @@ -1777,11 +1749,8 @@ create_n_graft2: tcm->tcm_parent, tcm->tcm_handle, tca, &err, extack); } - if (q == NULL) { - if (err == -EAGAIN) - goto replay; + if (!q) return err; - } graft: err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); @@ -1794,6 +1763,58 @@ graft: return 0; } +static void request_qdisc_module(struct nlattr *kind) +{ + struct Qdisc_ops *ops; + char name[IFNAMSIZ]; + + if (!kind) + return; + + ops = qdisc_lookup_ops(kind); + if (ops) { + bpf_module_put(ops, ops->owner); + return; + } + + if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { + rtnl_unlock(); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); + rtnl_lock(); + } +} + +/* + * Create/change qdisc. + */ +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + struct tcmsg *tcm; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + request_qdisc_module(tca[TCA_KIND]); + + tcm = nlmsg_data(n); + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, int *q_idx_p, int s_q_idx, bool recur, @@ -1878,17 +1899,23 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; + netdev_lock_ops(dev); if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), skb, cb, &q_idx, s_q_idx, - true, tca[TCA_DUMP_INVISIBLE]) < 0) + true, tca[TCA_DUMP_INVISIBLE]) < 0) { + netdev_unlock_ops(dev); goto done; + } dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping), skb, cb, &q_idx, s_q_idx, false, - tca[TCA_DUMP_INVISIBLE]) < 0) + tca[TCA_DUMP_INVISIBLE]) < 0) { + netdev_unlock_ops(dev); goto done; + } + netdev_unlock_ops(dev); cont: idx++; @@ -2125,15 +2152,15 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, #endif -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = nlmsg_data(n); - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - struct Qdisc *q = NULL; const struct Qdisc_class_ops *cops; + struct Qdisc *q = NULL; unsigned long cl = 0; unsigned long new_cl; u32 portid; @@ -2141,15 +2168,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, u32 qid; int err; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. @@ -2244,6 +2262,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, return -EOPNOTSUPP; } + /* Prevent creation of traffic classes with classid TC_H_ROOT */ + if (clid == TC_H_ROOT) { + NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT"); + return -EINVAL; + } + new_cl = cl; err = -EOPNOTSUPP; if (cops->change) @@ -2258,6 +2282,31 @@ out: return err; } +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + struct qdisc_dump_args { struct qdisc_walker w; struct sk_buff *skb; @@ -2334,20 +2383,12 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, return 0; } -static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb, + struct tcmsg *tcm, struct net_device *dev) { - struct tcmsg *tcm = nlmsg_data(cb->nlh); - struct net *net = sock_net(skb->sk); struct netdev_queue *dev_queue; - struct net_device *dev; int t, s_t; - if (nlmsg_len(cb->nlh) < sizeof(*tcm)) - return 0; - dev = dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return 0; - s_t = cb->args[0]; t = 0; @@ -2364,10 +2405,32 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) done: cb->args[0] = t; - dev_put(dev); return skb->len; } +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) + return 0; + + dev = dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return 0; + + netdev_lock_ops(dev); + err = __tc_dump_tclass(skb, cb, tcm, dev); + netdev_unlock_ops(dev); + + dev_put(dev); + + return err; +} + #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { @@ -2414,6 +2477,17 @@ static struct pernet_operations psched_net_ops = { DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); #endif +static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc}, + {.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc}, + {.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc, + .dumpit = tc_dump_qdisc}, + {.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass}, + {.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass}, + {.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass, + .dumpit = tc_dump_tclass}, +}; + static int __init pktsched_init(void) { int err; @@ -2432,14 +2506,7 @@ static int __init pktsched_init(void) register_qdisc(&mq_qdisc_ops); register_qdisc(&noqueue_qdisc_ops); - rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, - 0); - rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, - 0); + rtnl_register_many(psched_rtnl_msg_handlers); tc_wrapper_init(); diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index edee926ccde8..48dd8c88903f 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -361,8 +361,24 @@ static const u8 besteffort[] = { static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; static const u8 bulk_order[] = {1, 0, 2, 3}; +/* There is a big difference in timing between the accurate values placed in the + * cache and the approximations given by a single Newton step for small count + * values, particularly when stepping from count 1 to 2 or vice versa. Hence, + * these values are calculated using eight Newton steps, using the + * implementation below. Above 16, a single Newton step gives sufficient + * accuracy in either direction, given the precision stored. + * + * The magnitude of the error when stepping up to count 2 is such as to give the + * value that *should* have been produced at count 4. + */ + #define REC_INV_SQRT_CACHE (16) -static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; +static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { + ~0, ~0, 3037000500, 2479700525, + 2147483647, 1920767767, 1753413056, 1623345051, + 1518500250, 1431655765, 1358187914, 1294981364, + 1239850263, 1191209601, 1147878294, 1108955788 +}; /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) @@ -388,47 +404,14 @@ static void cobalt_newton_step(struct cobalt_vars *vars) static void cobalt_invsqrt(struct cobalt_vars *vars) { if (vars->count < REC_INV_SQRT_CACHE) - vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; + vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; else cobalt_newton_step(vars); } -/* There is a big difference in timing between the accurate values placed in - * the cache and the approximations given by a single Newton step for small - * count values, particularly when stepping from count 1 to 2 or vice versa. - * Above 16, a single Newton step gives sufficient accuracy in either - * direction, given the precision stored. - * - * The magnitude of the error when stepping up to count 2 is such as to give - * the value that *should* have been produced at count 4. - */ - -static void cobalt_cache_init(void) -{ - struct cobalt_vars v; - - memset(&v, 0, sizeof(v)); - v.rec_inv_sqrt = ~0U; - cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; - - for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { - cobalt_newton_step(&v); - cobalt_newton_step(&v); - cobalt_newton_step(&v); - cobalt_newton_step(&v); - - cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; - } -} - static void cobalt_vars_init(struct cobalt_vars *vars) { memset(vars, 0, sizeof(*vars)); - - if (!cobalt_rec_inv_sqrt_cache[0]) { - cobalt_cache_init(); - cobalt_rec_inv_sqrt_cache[0] = ~0; - } } /* CoDel control_law is t + interval/sqrt(count) @@ -501,13 +484,14 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, /* Call this with a freshly dequeued packet for possible congestion marking. * Returns true as an instruction to drop the packet, false for delivery. */ -static bool cobalt_should_drop(struct cobalt_vars *vars, - struct cobalt_params *p, - ktime_t now, - struct sk_buff *skb, - u32 bulk_flows) -{ - bool next_due, over_target, drop = false; +static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb, + u32 bulk_flows) +{ + enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; + bool next_due, over_target; ktime_t schedule; u64 sojourn; @@ -550,7 +534,8 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, if (next_due && vars->dropping) { /* Use ECN mark if possible, otherwise drop */ - drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) + reason = SKB_DROP_REASON_QDISC_CONGESTED; vars->count++; if (!vars->count) @@ -573,16 +558,17 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, } /* Simple BLUE implementation. Lack of ECN is deliberate. */ - if (vars->p_drop) - drop |= (get_random_u32() < vars->p_drop); + if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && + get_random_u32() < vars->p_drop) + reason = SKB_DROP_REASON_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); - else if (ktime_to_ns(schedule) > 0 && !drop) + else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) vars->drop_next = now; - return drop; + return reason; } static bool cake_update_flowkeys(struct flow_keys *keys, @@ -644,6 +630,63 @@ static bool cake_ddst(int flow_mode) return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } +static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count)) + q->hosts[flow->srchost].srchost_bulk_flow_count--; +} + +static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->srchost].srchost_bulk_flow_count++; +} + +static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count--; +} + +static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count++; +} + +static u16 cake_get_flow_quantum(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + u16 host_load = 1; + + if (cake_dsrc(flow_mode)) + host_load = max(host_load, + q->hosts[flow->srchost].srchost_bulk_flow_count); + + if (cake_ddst(flow_mode)) + host_load = max(host_load, + q->hosts[flow->dsthost].dsthost_bulk_flow_count); + + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors + */ + return (q->flow_quantum * quantum_div[host_load] + + get_random_u16()) >> 16; +} + static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { @@ -786,12 +829,13 @@ skip_hash: * queue, accept the collision, update the host tags. */ q->way_collisions++; - if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { - q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; - q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; - } allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); + + if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { + cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + } found: /* reserve queue for future packets in same flow */ reduced_hash = outer_hash + k; @@ -815,9 +859,10 @@ found: q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[srchost_idx].srchost_bulk_flow_count++; q->flows[reduced_hash].srchost = srchost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { @@ -838,9 +883,10 @@ found_src: q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[dsthost_idx].dsthost_bulk_flow_count++; q->flows[reduced_hash].dsthost = dsthost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } @@ -1512,7 +1558,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) if (!q->overflow_timeout) { int i; /* Build fresh max-heap */ - for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) cake_heapify(q, i); } q->overflow_timeout = 65535; @@ -1539,17 +1585,16 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) b->backlogs[idx] -= len; b->tin_backlog -= len; sch->qstats.backlog -= len; - qdisc_tree_reduce_backlog(sch, 1, len); flow->dropped++; b->tin_dropped++; - sch->qstats.drops++; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); - __qdisc_drop(skb, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; + qdisc_tree_reduce_backlog(sch, 1, len); cake_heapify(q, 0); @@ -1853,10 +1898,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - u16 host_load = 1; - if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { @@ -1866,18 +1907,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - flow->deficit = (b->flow_quantum * - quantum_div[host_load]) >> 16; + flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. */ @@ -1885,12 +1916,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; - + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) @@ -1940,20 +1967,19 @@ static void cake_clear_tin(struct Qdisc *sch, u16 tin) q->cur_tin = tin; for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) while (!!(skb = cake_dequeue_one(sch))) - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); } static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - struct cake_host *srchost, *dsthost; + enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; - u16 host_load; u64 delay; u32 len; @@ -2053,11 +2079,6 @@ retry: q->cur_flow = flow - b->flows; first_flow = false; - /* triple isolation (modified DRR++) */ - srchost = &b->hosts[flow->srchost]; - dsthost = &b->hosts[flow->dsthost]; - host_load = 1; - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying @@ -2069,11 +2090,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2085,19 +2103,7 @@ retry: } } - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - WARN_ON(host_load > CAKE_QUEUES); - - /* The get_random_u16() is a way to apply dithering to avoid - * accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - get_random_u16()) >> 16; + flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2121,11 +2127,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2143,12 +2146,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; - + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; @@ -2157,12 +2156,12 @@ retry: goto begin; } + reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ - if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, - (b->bulk_flow_count * - !!(q->rate_flags & - CAKE_FLAG_INGRESS))) || - !flow->head) + if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ @@ -2176,7 +2175,7 @@ retry: b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb(skb); + kfree_skb_reason(skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } @@ -2572,6 +2571,8 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; + u16 rate_flags; + u8 flow_mode; int err; err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, @@ -2579,10 +2580,11 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) - q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; - q->flow_mode |= CAKE_FLOW_NAT_FLAG * + flow_mode &= ~CAKE_FLOW_NAT_FLAG; + flow_mode |= CAKE_FLOW_NAT_FLAG * !!nla_get_u32(tb[TCA_CAKE_NAT]); #else NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], @@ -2592,29 +2594,34 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_BASE_RATE64]) - q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + WRITE_ONCE(q->rate_bps, + nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); if (tb[TCA_CAKE_DIFFSERV_MODE]) - q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + WRITE_ONCE(q->tin_mode, + nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); + rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) - q->rate_flags |= CAKE_FLAG_WASH; + rate_flags |= CAKE_FLAG_WASH; else - q->rate_flags &= ~CAKE_FLAG_WASH; + rate_flags &= ~CAKE_FLAG_WASH; } if (tb[TCA_CAKE_FLOW_MODE]) - q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); if (tb[TCA_CAKE_ATM]) - q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + WRITE_ONCE(q->atm_mode, + nla_get_u32(tb[TCA_CAKE_ATM])); if (tb[TCA_CAKE_OVERHEAD]) { - q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); - q->rate_flags |= CAKE_FLAG_OVERHEAD; + WRITE_ONCE(q->rate_overhead, + nla_get_s32(tb[TCA_CAKE_OVERHEAD])); + rate_flags |= CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2623,7 +2630,7 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_RAW]) { - q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + rate_flags &= ~CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2632,54 +2639,58 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_MPU]) - q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + WRITE_ONCE(q->rate_mpu, + nla_get_u32(tb[TCA_CAKE_MPU])); if (tb[TCA_CAKE_RTT]) { - q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); - if (!q->interval) - q->interval = 1; + WRITE_ONCE(q->interval, max(interval, 1U)); } if (tb[TCA_CAKE_TARGET]) { - q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); + u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); - if (!q->target) - q->target = 1; + WRITE_ONCE(q->target, max(target, 1U)); } if (tb[TCA_CAKE_AUTORATE]) { if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) - q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; } if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) - q->rate_flags |= CAKE_FLAG_INGRESS; + rate_flags |= CAKE_FLAG_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_INGRESS; + rate_flags &= ~CAKE_FLAG_INGRESS; } if (tb[TCA_CAKE_ACK_FILTER]) - q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + WRITE_ONCE(q->ack_filter, + nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); if (tb[TCA_CAKE_MEMORY]) - q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + WRITE_ONCE(q->buffer_config_limit, + nla_get_u32(tb[TCA_CAKE_MEMORY])); if (tb[TCA_CAKE_SPLIT_GSO]) { if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) - q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + rate_flags |= CAKE_FLAG_SPLIT_GSO; else - q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + rate_flags &= ~CAKE_FLAG_SPLIT_GSO; } if (tb[TCA_CAKE_FWMARK]) { - q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); - q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0; + WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); + WRITE_ONCE(q->fwmark_shft, + q->fwmark_mask ? __ffs(q->fwmark_mask) : 0); } + WRITE_ONCE(q->rate_flags, rate_flags); + WRITE_ONCE(q->flow_mode, flow_mode); if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2774,68 +2785,72 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; + u16 rate_flags; + u8 flow_mode; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; - if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, - TCA_CAKE_PAD)) + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, + READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, - q->flow_mode & CAKE_FLOW_MASK)) + flow_mode = READ_ONCE(q->flow_mode); + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + if (nla_put_u32(skb, TCA_CAKE_MEMORY, + READ_ONCE(q->buffer_config_limit))) goto nla_put_failure; + rate_flags = READ_ONCE(q->rate_flags); if (nla_put_u32(skb, TCA_CAKE_AUTORATE, - !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_INGRESS, - !!(q->rate_flags & CAKE_FLAG_INGRESS))) + !!(rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_NAT, - !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + !!(flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_WASH, - !!(q->rate_flags & CAKE_FLAG_WASH))) + !!(rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) + if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) goto nla_put_failure; - if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (!(rate_flags & CAKE_FLAG_OVERHEAD)) if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, - !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) + if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 69001eff0315..8c9a0400c862 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -310,7 +310,7 @@ static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; - int port_rate; + s64 port_rate; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); @@ -389,11 +389,11 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt, } /* Everything went OK, save the parameters used. */ - q->hicredit = qopt->hicredit; - q->locredit = qopt->locredit; - q->idleslope = qopt->idleslope * BYTES_PER_KBIT; - q->sendslope = qopt->sendslope * BYTES_PER_KBIT; - q->offload = qopt->offload; + WRITE_ONCE(q->hicredit, qopt->hicredit); + WRITE_ONCE(q->locredit, qopt->locredit); + WRITE_ONCE(q->idleslope, qopt->idleslope * BYTES_PER_KBIT); + WRITE_ONCE(q->sendslope, qopt->sendslope * BYTES_PER_KBIT); + WRITE_ONCE(q->offload, qopt->offload); return 0; } @@ -459,11 +459,11 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) if (!nest) goto nla_put_failure; - opt.hicredit = q->hicredit; - opt.locredit = q->locredit; - opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); - opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); - opt.offload = q->offload; + opt.hicredit = READ_ONCE(q->hicredit); + opt.locredit = READ_ONCE(q->locredit); + opt.sendslope = div64_s64(READ_ONCE(q->sendslope), BYTES_PER_KBIT); + opt.idleslope = div64_s64(READ_ONCE(q->idleslope), BYTES_PER_KBIT); + opt.offload = READ_ONCE(q->offload); if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ea108030c6b4..59e7bdf5063e 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -123,10 +123,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, if (idx == q->tail) choke_zap_tail_holes(q); + --sch->q.qlen; qdisc_qstats_backlog_dec(sch, skb); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_drop(skb, sch, to_free); - --sch->q.qlen; } struct choke_skb_cb { @@ -356,7 +356,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, tb[TCA_CHOKE_STAB] == NULL) return -EINVAL; - max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; + max_P = nla_get_u32_default(tb[TCA_CHOKE_MAX_P], 0); ctl = nla_data(tb[TCA_CHOKE_PARMS]); stab = nla_data(tb[TCA_CHOKE_STAB]); @@ -405,8 +405,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, } else sch_tree_lock(sch); - q->flags = ctl->flags; - q->limit = ctl->limit; + WRITE_ONCE(q->flags, ctl->flags); + WRITE_ONCE(q->limit, ctl->limit); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, @@ -431,15 +431,16 @@ static int choke_init(struct Qdisc *sch, struct nlattr *opt, static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) { struct choke_sched_data *q = qdisc_priv(sch); + u8 Wlog = READ_ONCE(q->parms.Wlog); struct nlattr *opts = NULL; struct tc_red_qopt opt = { - .limit = q->limit, - .flags = q->flags, - .qth_min = q->parms.qth_min >> q->parms.Wlog, - .qth_max = q->parms.qth_max >> q->parms.Wlog, - .Wlog = q->parms.Wlog, - .Plog = q->parms.Plog, - .Scell_log = q->parms.Scell_log, + .limit = READ_ONCE(q->limit), + .flags = READ_ONCE(q->flags), + .qth_min = READ_ONCE(q->parms.qth_min) >> Wlog, + .qth_max = READ_ONCE(q->parms.qth_max) >> Wlog, + .Wlog = Wlog, + .Plog = READ_ONCE(q->parms.Plog), + .Scell_log = READ_ONCE(q->parms.Scell_log), }; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -447,7 +448,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || - nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) + nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index ecb3f164bb25..c93761040c6e 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -65,10 +65,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) &q->stats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. - */ - if (q->stats.drop_count && sch->q.qlen) { + if (q->stats.drop_count) { qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; q->stats.drop_len = 0; @@ -89,7 +86,8 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } q = qdisc_priv(sch); q->drop_overlimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { @@ -118,30 +116,35 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CODEL_TARGET]) { u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); - q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.target, + ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_CE_THRESHOLD]) { u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); - q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.ce_threshold, + (val * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); - q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.interval, + ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_CODEL_LIMIT])); if (tb[TCA_CODEL_ECN]) - q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + WRITE_ONCE(q->params.ecn, + !!nla_get_u32(tb[TCA_CODEL_ECN])); qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); @@ -183,6 +186,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) { struct codel_sched_data *q = qdisc_priv(sch); + codel_time_t ce_threshold; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -190,17 +194,18 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_CODEL_TARGET, - codel_time_to_us(q->params.target)) || + codel_time_to_us(READ_ONCE(q->params.target))) || nla_put_u32(skb, TCA_CODEL_LIMIT, - sch->limit) || + READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_CODEL_INTERVAL, - codel_time_to_us(q->params.interval)) || + codel_time_to_us(READ_ONCE(q->params.interval))) || nla_put_u32(skb, TCA_CODEL_ECN, - q->params.ecn)) + READ_ONCE(q->params.ecn))) goto nla_put_failure; - if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + ce_threshold = READ_ONCE(q->params.ce_threshold); + if (ce_threshold != CODEL_DISABLED_THRESHOLD && nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, - codel_time_to_us(q->params.ce_threshold))) + codel_time_to_us(ce_threshold))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index c69b999fae17..9b6d79bd8737 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -35,6 +35,11 @@ struct drr_sched { struct Qdisc_class_hash clhash; }; +static bool cl_is_active(struct drr_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) { struct drr_sched *q = qdisc_priv(sch); @@ -105,6 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); + INIT_LIST_HEAD(&cl->alist); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -229,7 +235,7 @@ static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; - list_del(&cl->alist); + list_del_init(&cl->alist); } static int drr_dump_class(struct Qdisc *sch, unsigned long arg, @@ -336,7 +342,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; - bool first; cl = drr_classify(skb, sch, &err); if (cl == NULL) { @@ -346,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -356,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first) { + if (!cl_is_active(cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } @@ -390,7 +394,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) if (unlikely(skb == NULL)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); @@ -431,7 +435,7 @@ static void drr_reset_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); qdisc_reset(cl->qdisc); } } diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 2e4bef713b6a..c74d778c32a1 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -467,15 +467,15 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) if (!nest) goto nla_put_failure; - opt.delta = q->delta; - opt.clockid = q->clockid; - if (q->offload) + opt.delta = READ_ONCE(q->delta); + opt.clockid = READ_ONCE(q->clockid); + if (READ_ONCE(q->offload)) opt.flags |= TC_ETF_OFFLOAD_ON; - if (q->deadline_mode) + if (READ_ONCE(q->deadline_mode)) opt.flags |= TC_ETF_DEADLINE_MODE_ON; - if (q->skip_sock_check) + if (READ_ONCE(q->skip_sock_check)) opt.flags |= TC_ETF_SKIP_SOCK_CHECK; if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 835b4460b448..037f764822b9 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -74,6 +74,11 @@ static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, }; +static bool cl_is_active(struct ets_class *cl) +{ + return !list_empty(&cl->alist); +} + static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, unsigned int *quantum, struct netlink_ext_ack *extack) @@ -91,6 +96,8 @@ ets_class_from_arg(struct Qdisc *sch, unsigned long arg) { struct ets_sched *q = qdisc_priv(sch); + if (arg == 0 || arg > q->nbands) + return NULL; return &q->classes[arg - 1]; } @@ -291,7 +298,7 @@ static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) * to remove them. */ if (!ets_class_is_strict(q, cl) && sch->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); } static int ets_class_dump(struct Qdisc *sch, unsigned long arg, @@ -414,7 +421,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct ets_sched *q = qdisc_priv(sch); struct ets_class *cl; int err = 0; - bool first; cl = ets_classify(skb, sch, &err); if (!cl) { @@ -424,7 +430,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -434,7 +439,7 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first && !ets_class_is_strict(q, cl)) { + if (!cl_is_active(cl) && !ets_class_is_strict(q, cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } @@ -486,7 +491,7 @@ static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); return ets_qdisc_dequeue_skb(sch, skb); } @@ -646,7 +651,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); - q->nbands = nbands; + WRITE_ONCE(q->nbands, nbands); for (i = nstrict; i < q->nstrict; i++) { if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); @@ -655,14 +660,14 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, } for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) - list_del(&q->classes[i].alist); - qdisc_tree_flush_backlog(q->classes[i].qdisc); + list_del_init(&q->classes[i].alist); + qdisc_purge_queue(q->classes[i].qdisc); } - q->nstrict = nstrict; + WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); for (i = 0; i < q->nbands; i++) - q->classes[i].quantum = quanta[i]; + WRITE_ONCE(q->classes[i].quantum, quanta[i]); for (i = oldbands; i < q->nbands; i++) { q->classes[i].qdisc = queues[i]; @@ -676,7 +681,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); q->classes[i].qdisc = NULL; - q->classes[i].quantum = 0; + WRITE_ONCE(q->classes[i].quantum, 0); q->classes[i].deficit = 0; gnet_stats_basic_sync_init(&q->classes[i].bstats); memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); @@ -711,7 +716,7 @@ static void ets_qdisc_reset(struct Qdisc *sch) for (band = q->nstrict; band < q->nbands; band++) { if (q->classes[band].qdisc->q.qlen) - list_del(&q->classes[band].alist); + list_del_init(&q->classes[band].alist); } for (band = 0; band < q->nbands; band++) qdisc_reset(q->classes[band].qdisc); @@ -733,6 +738,7 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) struct ets_sched *q = qdisc_priv(sch); struct nlattr *opts; struct nlattr *nest; + u8 nbands, nstrict; int band; int prio; int err; @@ -745,21 +751,22 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) if (!opts) goto nla_err; - if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) + nbands = READ_ONCE(q->nbands); + if (nla_put_u8(skb, TCA_ETS_NBANDS, nbands)) goto nla_err; - if (q->nstrict && - nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) + nstrict = READ_ONCE(q->nstrict); + if (nstrict && nla_put_u8(skb, TCA_ETS_NSTRICT, nstrict)) goto nla_err; - if (q->nbands > q->nstrict) { + if (nbands > nstrict) { nest = nla_nest_start(skb, TCA_ETS_QUANTA); if (!nest) goto nla_err; - for (band = q->nstrict; band < q->nbands; band++) { + for (band = nstrict; band < nbands; band++) { if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, - q->classes[band].quantum)) + READ_ONCE(q->classes[band].quantum))) goto nla_err; } @@ -771,7 +778,8 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_err; for (prio = 0; prio <= TC_PRIO_MAX; prio++) { - if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) + if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, + READ_ONCE(q->prio2band[prio]))) goto nla_err; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 450f5c67ac49..e6bfd39ff339 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,7 +19,8 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -28,7 +29,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(sch->q.qlen < sch->limit)) + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -39,7 +40,10 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; - if (likely(sch->q.qlen < sch->limit)) + if (unlikely(READ_ONCE(sch->limit) == 0)) + return qdisc_drop(skb, sch, to_free); + + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; @@ -105,14 +109,14 @@ static int __fifo_init(struct Qdisc *sch, struct nlattr *opt, if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); - sch->limit = limit; + WRITE_ONCE(sch->limit, limit); } else { struct tc_fifo_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; - sch->limit = ctl->limit; + WRITE_ONCE(sch->limit, ctl->limit); } if (is_bfifo) @@ -154,7 +158,7 @@ static void fifo_destroy(struct Qdisc *sch) static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct tc_fifo_qopt opt = { .limit = sch->limit }; + struct tc_fifo_qopt opt = { .limit = READ_ONCE(sch->limit) }; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index cdf23ff16f40..902ff5470607 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -106,9 +106,12 @@ struct fq_perband_flows { int quantum; /* based on band nr : 576KB, 192KB, 64KB */ }; +#define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2) + struct fq_sched_data { /* Read mostly cache line */ + u64 offload_horizon; u32 quantum; u32 initial_quantum; u32 flow_refill_delay; @@ -122,7 +125,7 @@ struct fq_sched_data { u8 rate_enable; u8 fq_trees_log; u8 horizon_drop; - u8 prio2band[(TC_PRIO_MAX + 1) >> 2]; + u8 prio2band[FQ_PRIO2BAND_CRUMB_SIZE]; u32 timer_slack; /* hrtimer slack in ns */ /* Read/Write fields. */ @@ -159,7 +162,7 @@ struct fq_sched_data { /* return the i-th 2-bit value ("crumb") */ static u8 fq_prio2band(const u8 *prio2band, unsigned int prio) { - return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3; + return (READ_ONCE(prio2band[prio / 4]) >> (2 * (prio & 0x3))) & 0x3; } /* @@ -297,7 +300,7 @@ static void fq_gc(struct fq_sched_data *q, } /* Fast path can be used if : - * 1) Packet tstamp is in the past. + * 1) Packet tstamp is in the past, or within the pacing offload horizon. * 2) FQ qlen == 0 OR * (no flow is currently eligible for transmit, * AND fast path queue has less than 8 packets) @@ -312,7 +315,7 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, const struct fq_sched_data *q = qdisc_priv(sch); const struct sock *sk; - if (fq_skb_cb(skb)->time_to_send > now) + if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon) return false; if (sch->q.qlen != 0) { @@ -329,6 +332,12 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, */ if (q->internal.qlen >= 8) return false; + + /* Ordering invariants fall apart if some delayed flows + * are ready but we haven't serviced them, yet. + */ + if (q->time_next_delayed_flow <= now + q->offload_horizon) + return false; } sk = skb->sk; @@ -359,8 +368,9 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb, * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE * 4) We pretend they are orphaned + * TCP can also associate TIME_WAIT sockets with RST or ACK packets. */ - if (!sk || sk_listener(sk)) { + if (!sk || sk_listener_or_tw(sk)) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not @@ -527,6 +537,8 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb, return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); } +#define FQDR(reason) SKB_DROP_REASON_FQ_##reason + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -538,7 +550,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); if (unlikely(q->band_pkt_count[band] >= sch->limit)) { q->stat_band_drops[band]++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(BAND_LIMIT)); } now = ktime_get_ns(); @@ -548,8 +561,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check if packet timestamp is too far in the future. */ if (fq_packet_beyond_horizon(skb, q, now)) { if (q->horizon_drop) { - q->stat_horizon_drops++; - return qdisc_drop(skb, sch, to_free); + q->stat_horizon_drops++; + return qdisc_drop_reason(skb, sch, to_free, + FQDR(HORIZON_LIMIT)); } q->stat_horizon_caps++; skb->tstamp = now + q->horizon; @@ -562,7 +576,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (f != &q->internal) { if (unlikely(f->qlen >= q->flow_plimit)) { q->stat_flows_plimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(FLOW_LIMIT)); } if (fq_flow_is_detached(f)) { @@ -587,21 +602,25 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } +#undef FQDR static void fq_check_throttled(struct fq_sched_data *q, u64 now) { unsigned long sample; struct rb_node *p; - if (q->time_next_delayed_flow > now) + if (q->time_next_delayed_flow > now + q->offload_horizon) return; /* Update unthrottle latency EWMA. * This is cheap and can help diagnosing timer/latency problems. */ sample = (unsigned long)(now - q->time_next_delayed_flow); - q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; - q->unthrottle_latency_ns += sample >> 3; + if ((long)sample > 0) { + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + } + now += q->offload_horizon; q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { @@ -661,7 +680,9 @@ begin: pband = &q->band_flows[q->band_nr]; pband->credit = min(pband->credit + pband->quantum, pband->quantum); - goto begin; + if (pband->credit > 0) + goto begin; + retry = 0; } if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_range_ns(&q->watchdog, @@ -683,7 +704,7 @@ begin: u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, f->time_next_packet); - if (now < time_next_packet) { + if (now + q->offload_horizon < time_next_packet) { head->first = f->next; f->time_next_packet = time_next_packet; fq_flow_set_throttled(q, f); @@ -888,7 +909,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); q->fq_root = array; - q->fq_trees_log = log; + WRITE_ONCE(q->fq_trees_log, log); sch_tree_unlock(sch); @@ -921,17 +942,22 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, [TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)), [TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)), + [TCA_FQ_OFFLOAD_HORIZON] = { .type = NLA_U32 }, }; /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ static void fq_prio2band_compress_crumb(const u8 *in, u8 *out) { const int num_elems = TC_PRIO_MAX + 1; + u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE]; int i; - memset(out, 0, num_elems / 4); + memset(tmp, 0, sizeof(tmp)); for (i = 0; i < num_elems; i++) - out[i / 4] |= in[i] << (2 * (i & 0x3)); + tmp[i / 4] |= in[i] << (2 * (i & 0x3)); + + for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++) + WRITE_ONCE(out[i], tmp[i]); } static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out) @@ -958,7 +984,7 @@ static int fq_load_weights(struct fq_sched_data *q, } } for (i = 0; i < FQ_BANDS; i++) - q->band_flows[i].quantum = weights[i]; + WRITE_ONCE(q->band_flows[i].quantum, weights[i]); return 0; } @@ -1011,16 +1037,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = -EINVAL; } if (tb[TCA_FQ_PLIMIT]) - sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_FQ_PLIMIT])); if (tb[TCA_FQ_FLOW_PLIMIT]) - q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + WRITE_ONCE(q->flow_plimit, + nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT])); if (tb[TCA_FQ_QUANTUM]) { u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); if (quantum > 0 && quantum <= (1 << 20)) { - q->quantum = quantum; + WRITE_ONCE(q->quantum, quantum); } else { NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); err = -EINVAL; @@ -1028,7 +1056,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_FQ_INITIAL_QUANTUM]) - q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + WRITE_ONCE(q->initial_quantum, + nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM])); if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", @@ -1037,17 +1066,19 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_FLOW_MAX_RATE]) { u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); - q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; + WRITE_ONCE(q->flow_max_rate, + (rate == ~0U) ? ~0UL : rate); } if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) - q->low_rate_threshold = - nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + WRITE_ONCE(q->low_rate_threshold, + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD])); if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); if (enable <= 1) - q->rate_enable = enable; + WRITE_ONCE(q->rate_enable, + enable); else err = -EINVAL; } @@ -1055,7 +1086,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; - q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + WRITE_ONCE(q->flow_refill_delay, + usecs_to_jiffies(usecs_delay)); } if (!err && tb[TCA_FQ_PRIOMAP]) @@ -1065,22 +1097,38 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack); if (tb[TCA_FQ_ORPHAN_MASK]) - q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); + WRITE_ONCE(q->orphan_mask, + nla_get_u32(tb[TCA_FQ_ORPHAN_MASK])); if (tb[TCA_FQ_CE_THRESHOLD]) - q->ce_threshold = (u64)NSEC_PER_USEC * - nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + WRITE_ONCE(q->ce_threshold, + (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD])); if (tb[TCA_FQ_TIMER_SLACK]) - q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + WRITE_ONCE(q->timer_slack, + nla_get_u32(tb[TCA_FQ_TIMER_SLACK])); if (tb[TCA_FQ_HORIZON]) - q->horizon = (u64)NSEC_PER_USEC * - nla_get_u32(tb[TCA_FQ_HORIZON]); + WRITE_ONCE(q->horizon, + (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_HORIZON])); if (tb[TCA_FQ_HORIZON_DROP]) - q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); + WRITE_ONCE(q->horizon_drop, + nla_get_u8(tb[TCA_FQ_HORIZON_DROP])); + + if (tb[TCA_FQ_OFFLOAD_HORIZON]) { + u64 offload_horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]); + if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) { + WRITE_ONCE(q->offload_horizon, offload_horizon); + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon"); + err = -EINVAL; + } + } if (!err) { sch_tree_unlock(sch); @@ -1088,7 +1136,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); } while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; @@ -1160,13 +1208,14 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); - u64 ce_threshold = q->ce_threshold; struct tc_prio_qopt prio = { .bands = FQ_BANDS, }; - u64 horizon = q->horizon; struct nlattr *opts; + u64 offload_horizon; + u64 ce_threshold; s32 weights[3]; + u64 horizon; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) @@ -1174,35 +1223,52 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + ce_threshold = READ_ONCE(q->ce_threshold); do_div(ce_threshold, NSEC_PER_USEC); + + horizon = READ_ONCE(q->horizon); do_div(horizon, NSEC_PER_USEC); - if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || - nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || - nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || - nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + offload_horizon = READ_ONCE(q->offload_horizon); + do_div(offload_horizon, NSEC_PER_USEC); + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, + READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, + READ_ONCE(q->flow_plimit)) || + nla_put_u32(skb, TCA_FQ_QUANTUM, + READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, + READ_ONCE(q->initial_quantum)) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, + READ_ONCE(q->rate_enable)) || nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, - min_t(unsigned long, q->flow_max_rate, ~0U)) || + min_t(unsigned long, + READ_ONCE(q->flow_max_rate), ~0U)) || nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, - jiffies_to_usecs(q->flow_refill_delay)) || - nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) || + nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, + READ_ONCE(q->orphan_mask)) || nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, - q->low_rate_threshold) || + READ_ONCE(q->low_rate_threshold)) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, + READ_ONCE(q->fq_trees_log)) || + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, + READ_ONCE(q->timer_slack)) || nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || - nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) + nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) || + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, + READ_ONCE(q->horizon_drop))) goto nla_put_failure; fq_prio2band_decompress_crumb(q->prio2band, prio.priomap); if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio)) goto nla_put_failure; - weights[0] = q->band_flows[0].quantum; - weights[1] = q->band_flows[1].quantum; - weights[2] = q->band_flows[2].quantum; + weights[0] = READ_ONCE(q->band_flows[0].quantum); + weights[1] = READ_ONCE(q->band_flows[1].quantum); + weights[2] = READ_ONCE(q->band_flows[2].quantum); if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights)) goto nla_put_failure; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 79f9d6de6c85..2a0f3a513bfa 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -168,6 +168,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += get_codel_cb(skb)->mem_usage; + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); @@ -274,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -314,10 +315,8 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. - */ - if (q->cstats.drop_count && sch->q.qlen) { + + if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; @@ -396,44 +395,53 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_TARGET]) { u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); - q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.target, + (target * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); - q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.ce_threshold, + (val * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]) - q->cparams.ce_threshold_selector = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]); + WRITE_ONCE(q->cparams.ce_threshold_selector, + nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR])); if (tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]) - q->cparams.ce_threshold_mask = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]); + WRITE_ONCE(q->cparams.ce_threshold_mask, + nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK])); if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); - q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.interval, + (interval * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_FQ_CODEL_LIMIT])); if (tb[TCA_FQ_CODEL_ECN]) - q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + WRITE_ONCE(q->cparams.ecn, + !!nla_get_u32(tb[TCA_FQ_CODEL_ECN])); if (quantum) - q->quantum = quantum; + WRITE_ONCE(q->quantum, quantum); if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) - q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); + WRITE_ONCE(q->drop_batch_size, + max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]))); if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) - q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])); + WRITE_ONCE(q->memory_limit, + min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]))); while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { - struct sk_buff *skb = fq_codel_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); q->cstats.drop_len += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); @@ -522,6 +530,7 @@ init_failure: static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_codel_sched_data *q = qdisc_priv(sch); + codel_time_t ce_threshold; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -529,30 +538,33 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, - codel_time_to_us(q->cparams.target)) || + codel_time_to_us(READ_ONCE(q->cparams.target))) || nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, - sch->limit) || + READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, - codel_time_to_us(q->cparams.interval)) || + codel_time_to_us(READ_ONCE(q->cparams.interval))) || nla_put_u32(skb, TCA_FQ_CODEL_ECN, - q->cparams.ecn) || + READ_ONCE(q->cparams.ecn)) || nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, - q->quantum) || + READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE, - q->drop_batch_size) || + READ_ONCE(q->drop_batch_size)) || nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT, - q->memory_limit) || + READ_ONCE(q->memory_limit)) || nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, - q->flows_cnt)) + READ_ONCE(q->flows_cnt))) goto nla_put_failure; - if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD) { + ce_threshold = READ_ONCE(q->cparams.ce_threshold); + if (ce_threshold != CODEL_DISABLED_THRESHOLD) { if (nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, - codel_time_to_us(q->cparams.ce_threshold))) + codel_time_to_us(ce_threshold))) goto nla_put_failure; - if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, q->cparams.ce_threshold_selector)) + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, + READ_ONCE(q->cparams.ce_threshold_selector))) goto nla_put_failure; - if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, q->cparams.ce_threshold_mask)) + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, + READ_ONCE(q->cparams.ce_threshold_mask))) goto nla_put_failure; } diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 358cf304f4c9..b0e34daf1f75 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -130,6 +130,7 @@ static inline void flow_queue_add(struct fq_pie_flow *flow, static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct fq_pie_sched_data *q = qdisc_priv(sch); struct fq_pie_flow *sel_flow; int ret; @@ -161,6 +162,8 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->overmemory++; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, sel_flow->backlog, skb->len)) { enqueue = true; @@ -198,8 +201,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; sel_flow->vars.accu_prob = 0; - __qdisc_drop(skb, to_free); - qdisc_qstats_drop(sch); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } @@ -299,8 +301,8 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]); - q->p_params.limit = limit; - sch->limit = limit; + WRITE_ONCE(q->p_params.limit, limit); + WRITE_ONCE(sch->limit, limit); } if (tb[TCA_FQ_PIE_FLOWS]) { if (q->flows) { @@ -322,43 +324,49 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]); /* convert to pschedtime */ - q->p_params.target = - PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + WRITE_ONCE(q->p_params.target, + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC)); } /* tupdate is in jiffies */ if (tb[TCA_FQ_PIE_TUPDATE]) - q->p_params.tupdate = - usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])); + WRITE_ONCE(q->p_params.tupdate, + usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]))); if (tb[TCA_FQ_PIE_ALPHA]) - q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]); + WRITE_ONCE(q->p_params.alpha, + nla_get_u32(tb[TCA_FQ_PIE_ALPHA])); if (tb[TCA_FQ_PIE_BETA]) - q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]); + WRITE_ONCE(q->p_params.beta, + nla_get_u32(tb[TCA_FQ_PIE_BETA])); if (tb[TCA_FQ_PIE_QUANTUM]) - q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]); + WRITE_ONCE(q->quantum, nla_get_u32(tb[TCA_FQ_PIE_QUANTUM])); if (tb[TCA_FQ_PIE_MEMORY_LIMIT]) - q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + WRITE_ONCE(q->memory_limit, + nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT])); if (tb[TCA_FQ_PIE_ECN_PROB]) - q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]); + WRITE_ONCE(q->ecn_prob, + nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB])); if (tb[TCA_FQ_PIE_ECN]) - q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]); + WRITE_ONCE(q->p_params.ecn, + nla_get_u32(tb[TCA_FQ_PIE_ECN])); if (tb[TCA_FQ_PIE_BYTEMODE]) - q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]); + WRITE_ONCE(q->p_params.bytemode, + nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE])); if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) - q->p_params.dq_rate_estimator = - nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + WRITE_ONCE(q->p_params.dq_rate_estimator, + nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); len_dropped += qdisc_pkt_len(skb); num_dropped += 1; @@ -376,7 +384,7 @@ flow_error: static void fq_pie_timer(struct timer_list *t) { - struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); + struct fq_pie_sched_data *q = timer_container_of(q, t, adapt_timer); unsigned long next, tupdate; struct Qdisc *sch = q->sch; spinlock_t *root_lock; /* to lock qdisc for probability calculations */ @@ -471,22 +479,23 @@ static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb) return -EMSGSIZE; /* convert target from pschedtime to us */ - if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) || + if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_FQ_PIE_FLOWS, READ_ONCE(q->flows_cnt)) || nla_put_u32(skb, TCA_FQ_PIE_TARGET, - ((u32)PSCHED_TICKS2NS(q->p_params.target)) / + ((u32)PSCHED_TICKS2NS(READ_ONCE(q->p_params.target))) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_FQ_PIE_TUPDATE, - jiffies_to_usecs(q->p_params.tupdate)) || - nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) || - nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) || - nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) || - nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) || - nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) || - nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) || + jiffies_to_usecs(READ_ONCE(q->p_params.tupdate))) || + nla_put_u32(skb, TCA_FQ_PIE_ALPHA, READ_ONCE(q->p_params.alpha)) || + nla_put_u32(skb, TCA_FQ_PIE_BETA, READ_ONCE(q->p_params.beta)) || + nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, READ_ONCE(q->ecn_prob)) || + nla_put_u32(skb, TCA_FQ_PIE_ECN, READ_ONCE(q->p_params.ecn)) || + nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, READ_ONCE(q->p_params.bytemode)) || nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, - q->p_params.dq_rate_estimator)) + READ_ONCE(q->p_params.dq_rate_estimator))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -546,7 +555,7 @@ static void fq_pie_destroy(struct Qdisc *sch) tcf_block_put(q->block); q->p_params.tupdate = 0; - del_timer_sync(&q->adapt_timer); + timer_delete_sync(&q->adapt_timer); kvfree(q->flows); } diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index ce63414185fd..d1d87dce7f3f 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -16,14 +16,18 @@ struct sch_frag_data { unsigned int l2_len; u8 l2_data[VLAN_ETH_HLEN]; int (*xmit)(struct sk_buff *skb); + local_lock_t bh_lock; }; -static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); +static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); + lockdep_assert_held(&data->bh_lock); if (skb_cow_head(skb, data->l2_len) < 0) { kfree_skb(skb); return -ENOMEM; @@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, struct rtable sch_frag_rt = { 0 }; unsigned long orig_dst; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); @@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, IPCB(skb)->frag_max_size = mru; ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { unsigned long orig_dst; struct rt6_info sch_frag_rt; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, @@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else { net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n", diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff5336493777..16afb834fe4a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -24,6 +24,7 @@ #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> +#include <linux/bpf.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -495,7 +496,7 @@ EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { - struct net_device *dev = from_timer(dev, t, watchdog_timer); + struct net_device *dev = timer_container_of(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); @@ -506,19 +507,28 @@ static void dev_watchdog(struct timer_list *t) unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; + unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); + if (!netif_xmit_stopped(txq)) + continue; + + /* Paired with WRITE_ONCE() + smp_mb...() in + * netdev_tx_sent_queue() and netif_tx_stop_queue(). + */ + smp_mb(); trans_start = READ_ONCE(txq->trans_start); - if (netif_xmit_stopped(txq) && - time_after(jiffies, (trans_start + - dev->watchdog_timeo))) { + + if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } + if (time_after(oldest_start, trans_start)) + oldest_start = trans_start; } if (unlikely(timedout_ms)) { @@ -531,7 +541,7 @@ static void dev_watchdog(struct timer_list *t) netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + + round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } @@ -542,28 +552,23 @@ static void dev_watchdog(struct timer_list *t) netdev_put(dev, &dev->watchdog_dev_tracker); } -void __netdev_watchdog_up(struct net_device *dev) -{ - if (dev->netdev_ops->ndo_tx_timeout) { - if (dev->watchdog_timeo <= 0) - dev->watchdog_timeo = 5*HZ; - if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + dev->watchdog_timeo))) - netdev_hold(dev, &dev->watchdog_dev_tracker, - GFP_ATOMIC); - } -} -EXPORT_SYMBOL_GPL(__netdev_watchdog_up); - -static void dev_watchdog_up(struct net_device *dev) +void netdev_watchdog_up(struct net_device *dev) { - __netdev_watchdog_up(dev); + if (!dev->netdev_ops->ndo_tx_timeout) + return; + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + dev->watchdog_timeo))) + netdev_hold(dev, &dev->watchdog_dev_tracker, + GFP_ATOMIC); } +EXPORT_SYMBOL_GPL(netdev_watchdog_up); -static void dev_watchdog_down(struct net_device *dev) +static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); - if (del_timer(&dev->watchdog_timer)) + if (timer_delete(&dev->watchdog_timer)) netdev_put(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } @@ -582,7 +587,7 @@ void netif_carrier_on(struct net_device *dev) atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); @@ -630,6 +635,7 @@ EXPORT_SYMBOL_GPL(netif_carrier_event); static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { + dev_core_stats_tx_dropped_inc(skb->dev); __qdisc_drop(skb, to_free); return NET_XMIT_CN; } @@ -673,6 +679,7 @@ struct Qdisc noop_qdisc = { .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, + .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); @@ -900,8 +907,8 @@ static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, bands[prio] = q; } - return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, - GFP_KERNEL); + return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, + GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { @@ -945,7 +952,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); + lockdep_register_key(&sch->root_lock_key); spin_lock_init(&sch->q.lock); + lockdep_set_class(&sch->q.lock, &sch->root_lock_key); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = @@ -974,11 +983,13 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); return sch; errout1: + lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); @@ -991,14 +1002,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, { struct Qdisc *sch; - if (!try_module_get(ops->owner)) { + if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; @@ -1067,7 +1078,8 @@ static void __qdisc_destroy(struct Qdisc *qdisc) if (ops->destroy) ops->destroy(qdisc); - module_put(ops->owner); + lockdep_unregister_key(&qdisc->root_lock_key); + bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); @@ -1251,7 +1263,7 @@ void dev_activate(struct net_device *dev) if (need_watchdog) { netif_trans_update(dev); - dev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); @@ -1266,15 +1278,17 @@ static void qdisc_deactivate(struct Qdisc *qdisc) static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, - void *_qdisc_default) + void *_sync_needed) { - struct Qdisc *qdisc_default = _qdisc_default; + bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { + if (qdisc->enqueue) + *sync_needed = true; qdisc_deactivate(qdisc); - rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } @@ -1341,24 +1355,22 @@ static bool some_qdisc_is_busy(struct net_device *dev) */ void dev_deactivate_many(struct list_head *head) { + bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, - &noop_qdisc); + &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), - &noop_qdisc); + &sync_needed); - dev_watchdog_down(dev); + netdev_watchdog_down(dev); } - /* Wait for outstanding qdisc-less dev_queue_xmit calls or - * outstanding qdisc enqueuing calls. - * This is avoided if all devices are in dismantle phase : - * Caller will call synchronize_net() for us - */ - synchronize_net(); + /* Wait for outstanding qdisc enqueuing calls. */ + if (sync_needed) + synchronize_net(); list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 79ba9dc70254..532fde548b88 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -251,10 +251,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->stats.pdrop++; drop: - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); congestion_drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED); return NET_XMIT_CN; } @@ -668,7 +668,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; } - max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; + max_P = nla_get_u32_default(tb[TCA_GRED_MAX_P], 0); ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); @@ -913,7 +913,8 @@ static void gred_destroy(struct Qdisc *sch) for (i = 0; i < table->DPs; i++) gred_destroy_vq(table->tab[i]); - gred_offload(sch, TC_GRED_DESTROY); + if (table->opt) + gred_offload(sch, TC_GRED_DESTROY); kfree(table->opt); } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 4e626df742d7..d8fd35da32a7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -175,6 +175,11 @@ struct hfsc_sched { #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ +static bool cl_in_el_or_vttree(struct hfsc_class *cl) +{ + return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) || + ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node)); +} /* * eligible tree holds backlogged classes being sorted by their eligible times. @@ -203,7 +208,10 @@ eltree_insert(struct hfsc_class *cl) static inline void eltree_remove(struct hfsc_class *cl) { - rb_erase(&cl->el_node, &cl->sched->eligible); + if (!RB_EMPTY_NODE(&cl->el_node)) { + rb_erase(&cl->el_node, &cl->sched->eligible); + RB_CLEAR_NODE(&cl->el_node); + } } static inline void @@ -827,22 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) } } -static unsigned int -qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - unsigned int len; - - skb = sch->ops->peek(sch); - if (unlikely(skb == NULL)) { - qdisc_warn_nonwc("qdisc_peek_len", sch); - return 0; - } - len = qdisc_pkt_len(skb); - - return len; -} - static void hfsc_adjust_levels(struct hfsc_class *cl) { @@ -958,6 +950,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { int old_flags; + int len = 0; if (parentid) { if (cl->cl_parent && @@ -988,9 +981,13 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (usc != NULL) hfsc_change_usc(cl, usc, cur_time); + if (cl->qdisc->q.qlen != 0) + len = qdisc_peek_len(cl->qdisc); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. + */ if (cl->qdisc->q.qlen != 0) { - int len = qdisc_peek_len(cl->qdisc); - if (cl->cl_flags & HFSC_RSC) { if (old_flags & HFSC_RSC) update_ed(cl, len); @@ -1032,6 +1029,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + RB_CLEAR_NODE(&cl->el_node); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1174,7 +1173,8 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } /* classification failed, try default class */ - cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), + READ_ONCE(q->defcls)), sch); if (cl == NULL || cl->level > 0) return NULL; @@ -1219,7 +1219,8 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from vttree. */ - update_vf(cl, 0, 0); + if (cl->cl_nactive) + update_vf(cl, 0, 0); if (cl->cl_flags & HFSC_RSC) eltree_remove(cl); } @@ -1443,9 +1444,7 @@ hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; qopt = nla_data(opt); - sch_tree_lock(sch); - q->defcls = qopt->defcls; - sch_tree_unlock(sch); + WRITE_ONCE(q->defcls, qopt->defcls); return 0; } @@ -1525,7 +1524,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; - qopt.defcls = q->defcls; + qopt.defcls = READ_ONCE(q->defcls); if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; return skb->len; @@ -1561,7 +1560,10 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } - if (first) { + sch->qstats.backlog += len; + sch->q.qlen++; + + if (first && !cl_in_el_or_vttree(cl)) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) @@ -1576,9 +1578,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } - sch->qstats.backlog += len; - sch->q.qlen++; - return NET_XMIT_SUCCESS; } @@ -1633,10 +1632,16 @@ hfsc_dequeue(struct Qdisc *sch) if (cl->qdisc->q.qlen != 0) { /* update ed */ next_len = qdisc_peek_len(cl->qdisc); - if (realtime) - update_ed(cl, next_len); - else - update_d(cl, next_len); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. + */ + if (cl->qdisc->q.qlen != 0) { + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } } else { /* the class becomes passive */ eltree_remove(cl); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 3f906df1435b..5aa434b46707 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -534,33 +534,37 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); - q->quantum = new_quantum; - q->hhf_non_hh_weight = new_hhf_non_hh_weight; + WRITE_ONCE(q->quantum, new_quantum); + WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) - q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + WRITE_ONCE(q->hh_flows_limit, + nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); - q->hhf_reset_timeout = usecs_to_jiffies(us); + WRITE_ONCE(q->hhf_reset_timeout, + usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) - q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + WRITE_ONCE(q->hhf_admit_bytes, + nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); - q->hhf_evict_timeout = usecs_to_jiffies(us); + WRITE_ONCE(q->hhf_evict_timeout, + usecs_to_jiffies(us)); } qlen = sch->q.qlen; prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = hhf_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); rtnl_kfree_skbs(skb, skb); } @@ -657,15 +661,18 @@ static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, + READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, - jiffies_to_usecs(q->hhf_reset_timeout)) || - nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, + READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, - jiffies_to_usecs(q->hhf_evict_timeout)) || - nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, + READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 93e6fb56f3b5..14bf71f57057 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -348,7 +348,8 @@ static void htb_add_to_wait_tree(struct htb_sched *q, */ static inline void htb_next_rb_node(struct rb_node **n) { - *n = rb_next(*n); + if (*n) + *n = rb_next(*n); } /** @@ -609,8 +610,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) */ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(!cl->prio_activity); - + if (!cl->prio_activity) + return; htb_deactivate_prios(q, cl); cl->prio_activity = 0; } @@ -1039,13 +1040,6 @@ static void htb_work_func(struct work_struct *work) rcu_read_unlock(); } -static void htb_set_lockdep_class_child(struct Qdisc *q) -{ - static struct lock_class_key child_key; - - lockdep_set_class(qdisc_lock(q), &child_key); -} - static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt) { return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt); @@ -1132,7 +1126,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, return -ENOMEM; } - htb_set_lockdep_class_child(qdisc); q->direct_qdiscs[ntx] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } @@ -1468,7 +1461,6 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, } if (q->offload) { - htb_set_lockdep_class_child(new); /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ qdisc_refcount_inc(new); old_q = htb_graft_helper(dev_queue, new); @@ -1733,11 +1725,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid, NULL); - if (q->offload) { - if (new_q) - htb_set_lockdep_class_child(new_q); + if (q->offload) htb_parent_to_leaf_offload(sch, dev_queue, new_q); - } } sch_tree_lock(sch); @@ -1750,8 +1739,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, if (cl->parent) cl->parent->children--; - if (cl->prio_activity) - htb_deactivate(q, cl); + htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&cl->pq_node, @@ -1822,8 +1810,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB], NULL)); - rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; - ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + rate64 = nla_get_u64_default(tb[TCA_HTB_RATE64], 0); + ceil64 = nla_get_u64_default(tb[TCA_HTB_CEIL64], 0); if (!cl) { /* new class */ struct net_device *dev = qdisc_dev(sch); @@ -1947,13 +1935,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, classid, NULL); if (q->offload) { - if (new_q) { - htb_set_lockdep_class_child(new_q); - /* One ref for cl->leaf.q, the other for - * dev_queue->qdisc. - */ + /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ + if (new_q) qdisc_refcount_inc(new_q); - } old_q = htb_graft_helper(dev_queue, new_q); /* No qdisc_put needed. */ WARN_ON(!(old_q->flags & TCQ_F_BUILTIN)); @@ -1963,8 +1947,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); parent_qdisc = parent->leaf.q; - if (parent->prio_activity) - htb_deactivate(q, parent); + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index c2ef9dcf91d2..cc6051d4f2ef 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -91,7 +91,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -121,7 +121,7 @@ static void ingress_destroy(struct Qdisc *sch) tcf_block_put_ext(q->block, sch, &q->block_info); if (entry) { - tcx_miniq_set_active(entry, false); + tcx_miniq_dec(entry); if (!tcx_entry_is_active(entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(entry); @@ -257,7 +257,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -276,7 +276,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, false, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, false); @@ -302,7 +302,7 @@ static void clsact_destroy(struct Qdisc *sch) tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); if (ingress_entry) { - tcx_miniq_set_active(ingress_entry, false); + tcx_miniq_dec(ingress_entry); if (!tcx_entry_is_active(ingress_entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(ingress_entry); @@ -310,7 +310,7 @@ static void clsact_destroy(struct Qdisc *sch) } if (egress_entry) { - tcx_miniq_set_active(egress_entry, false); + tcx_miniq_dec(egress_entry); if (!tcx_entry_is_active(egress_entry)) { tcx_entry_update(dev, NULL, false); tcx_entry_free(egress_entry); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 225353fbb3f1..51d4013b6121 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -215,10 +215,8 @@ static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt, for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) fp[tc] = priv->fp[tc]; - nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) { - if (nla_type(n) != TCA_MQPRIO_TC_ENTRY) - continue; - + nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt, + nlattr_opt_len, rem) { err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack); if (err) goto out; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 79e93a19d5fa..06e03f5cd7ce 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -185,7 +185,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qopt->bands = qdisc_dev(sch)->real_num_tx_queues; - removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), + removed = kmalloc(sizeof(*removed) * (q->max_bands - qopt->bands), GFP_KERNEL); if (!removed) return -ENOMEM; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index edc72962ae63..fdd79d3ccd8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> +#include <linux/prandom.h> #include <linux/rtnetlink.h> #include <linux/reciprocal_div.h> #include <linux/rbtree.h> @@ -78,6 +79,8 @@ struct netem_sched_data { struct sk_buff *t_head; struct sk_buff *t_tail; + u32 t_len; + /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; @@ -382,6 +385,7 @@ static void tfifo_reset(struct Qdisc *sch) rtnl_kfree_skbs(q->t_head, q->t_tail); q->t_head = NULL; q->t_tail = NULL; + q->t_len = 0; } static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) @@ -411,6 +415,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) rb_link_node(&nskb->rbnode, parent, p); rb_insert_color(&nskb->rbnode, &q->t_root); } + q->t_len++; sch->q.qlen++; } @@ -446,12 +451,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; - struct sk_buff *skb2; + struct sk_buff *skb2 = NULL; struct sk_buff *segs = NULL; unsigned int prev_len = qdisc_pkt_len(skb); int count = 1; - int rc = NET_XMIT_SUCCESS; - int rc_drop = NET_XMIT_DROP; /* Do not fool qdisc_drop_all() */ skb->prev = NULL; @@ -480,19 +483,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_orphan_partial(skb); /* - * If we need to duplicate packet, then re-insert at top of the - * qdisc tree, since parent queuer expects that only one - * skb will be queued. + * If we need to duplicate packet, then clone it before + * original is modified. */ - if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = qdisc_root_bh(sch); - u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - - q->duplicate = 0; - rootq->enqueue(skb2, rootq, to_free); - q->duplicate = dupsave; - rc_drop = NET_XMIT_SUCCESS; - } + if (count > 1) + skb2 = skb_clone(skb, GFP_ATOMIC); /* * Randomized packet corruption. @@ -504,7 +499,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb_is_gso(skb)) { skb = netem_segment(skb, sch, to_free); if (!skb) - return rc_drop; + goto finish_segs; + segs = skb->next; skb_mark_not_on_list(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; @@ -526,11 +522,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1<<get_random_u32_below(8); } - if (unlikely(sch->q.qlen >= sch->limit)) { + if (unlikely(q->t_len >= sch->limit)) { /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); - return rc_drop; + if (skb2) + __qdisc_drop(skb2, to_free); + return NET_XMIT_DROP; + } + + /* + * If doing duplication then re-insert at top of the + * qdisc tree, since parent queuer expects that only one + * skb will be queued. + */ + if (skb2) { + struct Qdisc *rootq = qdisc_root_bh(sch); + u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ + + q->duplicate = 0; + rootq->enqueue(skb2, rootq, to_free); + q->duplicate = dupsave; + skb2 = NULL; } qdisc_qstats_backlog_inc(sch, skb); @@ -601,9 +614,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } finish_segs: + if (skb2) + __qdisc_drop(skb2, to_free); + if (segs) { unsigned int len, last_len; - int nb; + int rc, nb; len = skb ? skb->len : 0; nb = skb ? 1 : 0; @@ -690,8 +706,8 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); if (skb) { - qdisc_qstats_backlog_dec(sch, skb); deliver: + qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); return skb; } @@ -707,8 +723,7 @@ deliver: if (time_to_send <= now && q->slot.slot_next <= now) { netem_erase_head(q, skb); - sch->q.qlen--; - qdisc_qstats_backlog_dec(sch, skb); + q->t_len--; skb->next = NULL; skb->prev = NULL; /* skb->dev shares skb->rbnode area, @@ -731,21 +746,25 @@ deliver: err = qdisc_enqueue(skb, q->qdisc, &to_free); kfree_skb_list(to_free); - if (err != NET_XMIT_SUCCESS && - net_xmit_drop_count(err)) { - qdisc_qstats_drop(sch); - qdisc_tree_reduce_backlog(sch, 1, - pkt_len); + if (err != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(err)) + qdisc_qstats_drop(sch); + sch->qstats.backlog -= pkt_len; + sch->q.qlen--; + qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } + sch->q.qlen--; goto deliver; } if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); - if (skb) + if (skb) { + sch->q.qlen--; goto deliver; + } } qdisc_watchdog_schedule_ns(&q->watchdog, @@ -755,8 +774,10 @@ deliver: if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); - if (skb) + if (skb) { + sch->q.qlen--; goto deliver; + } } return NULL; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 1764059b0635..ad46ee3ed5a9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -93,6 +94,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, skb->len)) { enqueue = true; @@ -121,7 +124,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; q->vars.accu_prob = 0; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, reason); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { @@ -156,41 +159,43 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); /* convert to pschedtime */ - q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + WRITE_ONCE(q->params.target, + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC)); } /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) - q->params.tupdate = - usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + WRITE_ONCE(q->params.tupdate, + usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]))); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); - q->params.limit = limit; - sch->limit = limit; + WRITE_ONCE(q->params.limit, limit); + WRITE_ONCE(sch->limit, limit); } if (tb[TCA_PIE_ALPHA]) - q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + WRITE_ONCE(q->params.alpha, nla_get_u32(tb[TCA_PIE_ALPHA])); if (tb[TCA_PIE_BETA]) - q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + WRITE_ONCE(q->params.beta, nla_get_u32(tb[TCA_PIE_BETA])); if (tb[TCA_PIE_ECN]) - q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + WRITE_ONCE(q->params.ecn, nla_get_u32(tb[TCA_PIE_ECN])); if (tb[TCA_PIE_BYTEMODE]) - q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + WRITE_ONCE(q->params.bytemode, + nla_get_u32(tb[TCA_PIE_BYTEMODE])); if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) - q->params.dq_rate_estimator = - nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]); + WRITE_ONCE(q->params.dq_rate_estimator, + nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); @@ -419,7 +424,7 @@ EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { - struct pie_sched_data *q = from_timer(q, t, adapt_timer); + struct pie_sched_data *q = timer_container_of(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; @@ -469,17 +474,18 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, - ((u32)PSCHED_TICKS2NS(q->params.target)) / + ((u32)PSCHED_TICKS2NS(READ_ONCE(q->params.target))) / NSEC_PER_USEC) || - nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_PIE_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_PIE_TUPDATE, - jiffies_to_usecs(q->params.tupdate)) || - nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || - nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || + jiffies_to_usecs(READ_ONCE(q->params.tupdate))) || + nla_put_u32(skb, TCA_PIE_ALPHA, READ_ONCE(q->params.alpha)) || + nla_put_u32(skb, TCA_PIE_BETA, READ_ONCE(q->params.beta)) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || - nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) || + nla_put_u32(skb, TCA_PIE_BYTEMODE, + READ_ONCE(q->params.bytemode)) || nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, - q->params.dq_rate_estimator)) + READ_ONCE(q->params.dq_rate_estimator))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -539,7 +545,7 @@ static void pie_destroy(struct Qdisc *sch) struct pie_sched_data *q = qdisc_priv(sch); q->params.tupdate = 0; - del_timer_sync(&q->adapt_timer); + timer_delete_sync(&q->adapt_timer); } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index cc30f7a32f1a..9e2b9a490db2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -211,7 +211,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) - qdisc_tree_flush_backlog(q->queues[i]); + qdisc_purge_queue(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d584c0c25899..bcce36608871 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -202,6 +202,11 @@ struct qfq_sched { */ enum update_reason {enqueue, requeue}; +static bool cl_is_active(struct qfq_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); @@ -347,7 +352,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) struct qfq_aggregate *agg = cl->agg; - list_del(&cl->alist); /* remove from RR queue of the aggregate */ + list_del_init(&cl->alist); /* remove from RR queue of the aggregate */ if (list_empty(&agg->active)) /* agg is now inactive */ qfq_deactivate_agg(q, agg); } @@ -421,10 +426,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (err < 0) return err; - if (tb[TCA_QFQ_WEIGHT]) - weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); - else - weight = 1; + weight = nla_get_u32_default(tb[TCA_QFQ_WEIGHT], 1); if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); @@ -450,7 +452,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, - "total weight out of range (%d + %u)\n", + "total weight out of range (%d + %u)", delta_w, q->wsum); return -EINVAL; } @@ -477,6 +479,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; + INIT_LIST_HEAD(&cl->alist); cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); @@ -985,8 +988,8 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, cl->deficit -= (int) len; if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ - list_del(&cl->alist); - else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + list_del_init(&cl->alist); + else if (cl->deficit < qdisc_peek_len(cl->qdisc)) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } @@ -1217,7 +1220,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct qfq_class *cl; struct qfq_aggregate *agg; int err = 0; - bool first; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -1239,7 +1241,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -1255,8 +1256,8 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, ++sch->q.qlen; agg = cl->agg; - /* if the queue was not empty, then done here */ - if (!first) { + /* if the class is active, then done here */ + if (cl_is_active(cl)) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) == cl && cl->deficit < len) @@ -1418,6 +1419,8 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; + if (list_empty(&cl->alist)) + return; qfq_deactivate_class(q, cl); } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index b5f096588fae..479c42d11083 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -70,6 +70,7 @@ static int red_use_nodrop(struct red_sched_data *q) static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_CONGESTED; struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; unsigned int len; @@ -107,6 +108,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; case RED_HARD_MARK: + reason = SKB_DROP_REASON_QDISC_OVERLIMIT; qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q)) { q->stats.forced_drop++; @@ -143,7 +145,7 @@ congestion_drop: if (!skb) return NET_XMIT_CN | ret; - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } @@ -216,7 +218,7 @@ static void red_destroy(struct Qdisc *sch) tcf_qevent_destroy(&q->qe_mark, sch); tcf_qevent_destroy(&q->qe_early_drop, sch); - del_timer_sync(&q->adapt_timer); + timer_delete_sync(&q->adapt_timer); red_offload(sch, false); qdisc_put(q->qdisc); } @@ -248,7 +250,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, tb[TCA_RED_STAB] == NULL) return -EINVAL; - max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; + max_P = nla_get_u32_default(tb[TCA_RED_MAX_P], 0); ctl = nla_data(tb[TCA_RED_PARMS]); stab = nla_data(tb[TCA_RED_STAB]); @@ -283,7 +285,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, q->userbits = userbits; q->limit = ctl->limit; if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old_child = q->qdisc; q->qdisc = child; } @@ -295,7 +297,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P); red_set_vars(&q->vars); - del_timer(&q->adapt_timer); + timer_delete(&q->adapt_timer); if (ctl->flags & TC_RED_ADAPTATIVE) mod_timer(&q->adapt_timer, jiffies + HZ/2); @@ -319,7 +321,7 @@ unlock_out: static inline void red_adaptative_timer(struct timer_list *t) { - struct red_sched_data *q = from_timer(q, t, adapt_timer); + struct red_sched_data *q = timer_container_of(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index b717e15a3a17..d2835f1168e1 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -280,6 +280,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; @@ -380,6 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } r = get_random_u16() & SFB_MAX_PROB; + reason = SKB_DROP_REASON_QDISC_CONGESTED; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { @@ -414,7 +416,7 @@ enqueue: return ret; drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index e66f4afb920d..96eb2f122973 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -77,12 +77,6 @@ #define SFQ_EMPTY_SLOT 0xffff #define SFQ_DEFAULT_HASH_DIVISOR 1024 -/* We use 16 bits to store allot, and want to handle packets up to 64K - * Scale allot by 8 (1<<3) so that no overflow occurs. - */ -#define SFQ_ALLOT_SHIFT 3 -#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) - /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ typedef u16 sfq_index; @@ -104,7 +98,7 @@ struct sfq_slot { sfq_index next; /* next slot in sfq RR chain */ struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ - short allot; /* credit for this slot */ + int allot; /* credit for this slot */ unsigned int backlog; struct red_vars vars; @@ -120,7 +114,6 @@ struct sfq_sched_data { siphash_key_t perturbation; u8 cur_depth; /* depth of longest slot */ u8 flags; - unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct tcf_proto __rcu *filter_list; struct tcf_block *block; sfq_index *ht; /* Hash table ('divisor' slots) */ @@ -317,7 +310,10 @@ drop: /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ x = q->tail->next; slot = &q->slots[x]; - q->tail->next = slot->next; + if (slot->next == x) + q->tail = NULL; /* no more active slots */ + else + q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } @@ -456,7 +452,7 @@ enqueue: */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ - slot->allot = q->scaled_quantum; + slot->allot = q->quantum; } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; @@ -493,7 +489,7 @@ next_slot: slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; - slot->allot += q->scaled_quantum; + slot->allot += q->quantum; goto next_slot; } skb = slot_dequeue_head(slot); @@ -512,7 +508,7 @@ next_slot: } q->tail->next = next_a; } else { - slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb)); + slot->allot -= qdisc_pkt_len(skb); } return skb; } @@ -595,7 +591,7 @@ drop: q->tail->next = x; } q->tail = slot; - slot->allot = q->scaled_quantum; + slot->allot = q->quantum; } } sch->q.qlen -= dropped; @@ -604,10 +600,11 @@ drop: static void sfq_perturbation(struct timer_list *t) { - struct sfq_sched_data *q = from_timer(q, t, perturb_timer); + struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; + int period; get_random_bytes(&nkey, sizeof(nkey)); rcu_read_lock(); @@ -618,12 +615,17 @@ static void sfq_perturbation(struct timer_list *t) sfq_rehash(sch); spin_unlock(root_lock); - if (q->perturb_period) - mod_timer(&q->perturb_timer, jiffies + q->perturb_period); + /* q->perturb_period can change under us from + * sfq_change() and sfq_destroy(). + */ + period = READ_ONCE(q->perturb_period); + if (period) + mod_timer(&q->perturb_timer, jiffies + period); rcu_read_unlock(); } -static int sfq_change(struct Qdisc *sch, struct nlattr *opt) +static int sfq_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); @@ -632,6 +634,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) struct red_parms *p = NULL; struct sk_buff *to_free = NULL; struct sk_buff *tail = NULL; + unsigned int maxflows; + unsigned int quantum; + unsigned int divisor; + int perturb_period; + u8 headdrop; + u8 maxdepth; + int limit; + u8 flags; + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; @@ -641,13 +652,17 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; - /* slot->allot is a short, make sure quantum is not too big. */ - if (ctl->quantum) { - unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum); + if ((int)ctl->quantum < 0) { + NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); + return -EINVAL; + } - if (scaled <= 0 || scaled > SHRT_MAX) - return -EINVAL; + if (ctl->perturb_period < 0 || + ctl->perturb_period > INT_MAX / HZ) { + NL_SET_ERR_MSG_MOD(extack, "invalid perturb period"); + return -EINVAL; } + perturb_period = ctl->perturb_period * HZ; if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) @@ -657,38 +672,63 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (!p) return -ENOMEM; } + sch_tree_lock(sch); - if (ctl->quantum) { - q->quantum = ctl->quantum; - q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); - } - q->perturb_period = ctl->perturb_period * HZ; + + limit = q->limit; + divisor = q->divisor; + headdrop = q->headdrop; + maxdepth = q->maxdepth; + maxflows = q->maxflows; + quantum = q->quantum; + flags = q->flags; + + /* update and validate configuration */ + if (ctl->quantum) + quantum = ctl->quantum; if (ctl->flows) - q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { - q->divisor = ctl->divisor; - q->maxflows = min_t(u32, q->maxflows, q->divisor); + divisor = ctl->divisor; + maxflows = min_t(u32, maxflows, divisor); } if (ctl_v1) { if (ctl_v1->depth) - q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); if (p) { - swap(q->red_parms, p); - red_set_parms(q->red_parms, + red_set_parms(p, ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Plog, ctl_v1->Scell_log, NULL, ctl_v1->max_P); } - q->flags = ctl_v1->flags; - q->headdrop = ctl_v1->headdrop; + flags = ctl_v1->flags; + headdrop = ctl_v1->headdrop; } if (ctl->limit) { - q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); - q->maxflows = min_t(u32, q->maxflows, q->limit); + limit = min_t(u32, ctl->limit, maxdepth * maxflows); + maxflows = min_t(u32, maxflows, limit); + } + if (limit == 1) { + sch_tree_unlock(sch); + kfree(p); + NL_SET_ERR_MSG_MOD(extack, "invalid limit"); + return -EINVAL; } + /* commit configuration */ + q->limit = limit; + q->divisor = divisor; + q->headdrop = headdrop; + q->maxdepth = maxdepth; + q->maxflows = maxflows; + WRITE_ONCE(q->perturb_period, perturb_period); + q->quantum = quantum; + q->flags = flags; + if (p) + swap(q->red_parms, p); + qlen = sch->q.qlen; while (sch->q.qlen > q->limit) { dropped += sfq_drop(sch, &to_free); @@ -699,7 +739,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) rtnl_kfree_skbs(to_free, tail); qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); - del_timer(&q->perturb_timer); + timer_delete(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); @@ -724,8 +764,8 @@ static void sfq_destroy(struct Qdisc *sch) struct sfq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); - q->perturb_period = 0; - del_timer_sync(&q->perturb_timer); + WRITE_ONCE(q->perturb_period, 0); + timer_delete_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); kfree(q->red_parms); @@ -757,12 +797,11 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt, q->divisor = SFQ_DEFAULT_HASH_DIVISOR; q->maxflows = SFQ_DEFAULT_FLOWS; q->quantum = psched_mtu(qdisc_dev(sch)); - q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; get_random_bytes(&q->perturbation, sizeof(q->perturbation)); if (opt) { - int err = sfq_change(sch, opt); + int err = sfq_change(sch, opt, extack); if (err) return err; } @@ -873,7 +912,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; - xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; + xstats.allot = slot->allot; qs.qlen = slot->qlen; qs.backlog = slot->backlog; } diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index b4dd626c309c..f485f62ab721 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -79,7 +79,9 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, prio = min(skb->priority, max_priority); qdisc = &q->qdiscs[prio]; - if (sch->q.qlen < sch->limit) { + + /* sch->limit can change under us from skbprio_change() */ + if (sch->q.qlen < READ_ONCE(sch->limit)) { __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); @@ -121,8 +123,6 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check to update highest and lowest priorities. */ if (skb_queue_empty(lp_qdisc)) { if (q->lowest_prio == q->highest_prio) { - /* The incoming packet is the only packet in queue. */ - BUG_ON(sch->q.qlen != 1); q->lowest_prio = prio; q->highest_prio = prio; } else { @@ -154,7 +154,6 @@ static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) /* Update highest priority field. */ if (skb_queue_empty(hpq)) { if (q->lowest_prio == q->highest_prio) { - BUG_ON(sch->q.qlen); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } else { @@ -172,7 +171,7 @@ static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, if (opt->nla_len != nla_attr_size(sizeof(*ctl))) return -EINVAL; - sch->limit = ctl->limit; + WRITE_ONCE(sch->limit, ctl->limit); return 0; } @@ -200,7 +199,7 @@ static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_skbprio_qopt opt; - opt.limit = sch->limit; + opt.limit = READ_ONCE(sch->limit); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) return -1; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a0d54b422186..2b14c81a87e5 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1151,11 +1151,6 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - if (!cycle) { - NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); - return -EINVAL; - } - if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; @@ -1164,6 +1159,11 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, new->cycle_time = cycle; } + if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { + NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); + return -EINVAL; + } + taprio_calculate_gate_durations(q, new); return 0; @@ -1176,16 +1176,13 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); - if (!qopt && !dev->num_tc) { - NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); - return -EINVAL; - } - - /* If num_tc is already set, it means that the user already - * configured the mqprio part - */ - if (dev->num_tc) + if (!qopt) { + if (!dev->num_tc) { + NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); + return -EINVAL; + } return 0; + } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { @@ -1331,13 +1328,15 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, stab = rtnl_dereference(q->root->stab); - oper = rtnl_dereference(q->oper_sched); + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); - admin = rtnl_dereference(q->admin_sched); + admin = rcu_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); + rcu_read_unlock(); break; } @@ -1613,7 +1612,7 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; - struct ethtool_ts_info info = { + struct kernel_ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; @@ -1752,10 +1751,7 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, fp[tc] = q->fp[tc]; } - nla_for_each_nested(n, opt, rem) { - if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) - continue; - + nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) @@ -1834,7 +1830,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. */ - taprio_flags = tb[TCA_TAPRIO_ATTR_FLAGS] ? nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]) : 0; + taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0); /* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && @@ -1851,6 +1847,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, } q->flags = taprio_flags; + /* Needed for length_to_duration() during netlink attribute parsing */ + taprio_set_picos_per_byte(dev, q); + err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; @@ -1910,7 +1909,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) goto free_sched; - taprio_set_picos_per_byte(dev, q); taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) @@ -1936,8 +1934,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { - hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); @@ -1956,7 +1953,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - rcu_assign_pointer(q->admin_sched, new_admin); + /* Not going to race against advance_sched(), but still */ + admin = rcu_replace_pointer(q->admin_sched, new_admin, + lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { @@ -1967,7 +1966,8 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, taprio_start_sched(sch, start, new_admin); - rcu_assign_pointer(q->admin_sched, new_admin); + admin = rcu_replace_pointer(q->admin_sched, new_admin, + lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); @@ -2057,8 +2057,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, spin_lock_init(&q->current_entry_lock); - hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; @@ -2375,9 +2374,6 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; - oper = rtnl_dereference(q->oper_sched); - admin = rtnl_dereference(q->admin_sched); - mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -2398,18 +2394,23 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; + rcu_read_lock(); + + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); + if (oper && taprio_dump_tc_entries(skb, q, oper)) - goto options_error; + goto options_error_rcu; if (oper && dump_schedule(skb, oper)) - goto options_error; + goto options_error_rcu; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) - goto options_error; + goto options_error_rcu; if (dump_schedule(skb, admin)) goto admin_error; @@ -2417,11 +2418,15 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_nest_end(skb, sched_nest); done: + rcu_read_unlock(); return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); +options_error_rcu: + rcu_read_unlock(); + options_error: nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index f1d09183ae63..4c977f049670 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -208,7 +208,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); - unsigned int len = 0, prev_len = qdisc_pkt_len(skb); + unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len; int ret, nb; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); @@ -219,21 +219,27 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, nb = 0; skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); - qdisc_skb_cb(segs)->pkt_len = segs->len; - len += segs->len; + seg_len = segs->len; + qdisc_skb_cb(segs)->pkt_len = seg_len; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { nb++; + len += seg_len; } } sch->q.qlen += nb; - if (nb > 1) + sch->qstats.backlog += len; + if (nb > 0) { qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); - consume_skb(skb); - return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; + consume_skb(skb); + return NET_XMIT_SUCCESS; + } + + kfree_skb(skb); + return NET_XMIT_DROP; } static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -446,7 +452,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 59304611dc00..8badec6d82a2 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -78,7 +78,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); - if (q->q.qlen < dev->tx_queue_len) { + if (q->q.qlen < READ_ONCE(dev->tx_queue_len)) { __skb_queue_tail(&q->q, skb); return NET_XMIT_SUCCESS; } @@ -424,7 +424,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu) } while ((q = NEXT_SLAVE(q)) != m->slaves); } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } |