diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 12 | ||||
-rw-r--r-- | net/sched/Makefile | 1 | ||||
-rw-r--r-- | net/sched/act_api.c | 9 | ||||
-rw-r--r-- | net/sched/act_connmark.c | 18 | ||||
-rw-r--r-- | net/sched/act_csum.c | 18 | ||||
-rw-r--r-- | net/sched/act_ct.c | 30 | ||||
-rw-r--r-- | net/sched/act_ctinfo.c | 42 | ||||
-rw-r--r-- | net/sched/act_mpls.c | 21 | ||||
-rw-r--r-- | net/sched/act_nat.c | 25 | ||||
-rw-r--r-- | net/sched/act_pedit.c | 20 | ||||
-rw-r--r-- | net/sched/act_police.c | 18 | ||||
-rw-r--r-- | net/sched/act_skbedit.c | 20 | ||||
-rw-r--r-- | net/sched/bpf_qdisc.c | 9 | ||||
-rw-r--r-- | net/sched/em_text.c | 2 | ||||
-rw-r--r-- | net/sched/sch_api.c | 52 | ||||
-rw-r--r-- | net/sched/sch_cake.c | 5 | ||||
-rw-r--r-- | net/sched/sch_dualpi2.c | 1175 | ||||
-rw-r--r-- | net/sched/sch_ets.c | 2 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 2 | ||||
-rw-r--r-- | net/sched/sch_hfsc.c | 16 | ||||
-rw-r--r-- | net/sched/sch_htb.c | 4 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 40 | ||||
-rw-r--r-- | net/sched/sch_prio.c | 2 | ||||
-rw-r--r-- | net/sched/sch_qfq.c | 35 | ||||
-rw-r--r-- | net/sched/sch_red.c | 2 | ||||
-rw-r--r-- | net/sched/sch_sfq.c | 15 | ||||
-rw-r--r-- | net/sched/sch_taprio.c | 18 | ||||
-rw-r--r-- | net/sched/sch_tbf.c | 2 |
28 files changed, 1423 insertions, 192 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ad914d2b2e22..6ddff028b81a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -415,6 +415,18 @@ config NET_SCH_BPF If unsure, say N. +config NET_SCH_DUALPI2 + tristate "Dual Queue PI Square (DUALPI2) scheduler" + help + Say Y here if you want to use the Dual Queue Proportional Integral + Controller Improved with a Square scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc9332 + + To compile this driver as a module, choose M here: the module + will be called sch_dualpi2. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 904d784902d1..5078ea84e6ad 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o +obj-$(CONFIG_NET_SCH_DUALPI2) += sch_dualpi2.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 057e20cef375..9e468e463467 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -933,18 +933,25 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; + bool mutex_taken = false; struct tc_action *p; - int ret; unsigned long id = 1; unsigned long tmp; + int ret; idr_for_each_entry_ul(idr, p, tmp, id) { + if (tc_act_in_hw(p) && !mutex_taken) { + rtnl_lock(); + mutex_taken = true; + } ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } + if (mutex_taken) + rtnl_unlock(); idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 0fce631e7c91..3e89927d7116 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -88,7 +88,7 @@ count: /* using overlimits stats to count how many packets marked */ tcf_action_inc_overlimit_qstats(&ca->common); out: - return READ_ONCE(ca->tcf_action); + return parms->action; } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { @@ -167,6 +167,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (err < 0) goto release_idr; + nparms->action = parm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); @@ -190,20 +192,20 @@ out_free: static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_connmark_info *ci = to_connmark(a); + const struct tcf_connmark_parms *parms; struct tc_connmark opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - struct tcf_connmark_parms *parms; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + parms = rcu_dereference(ci->parms); - opt.action = ci->tcf_action; + opt.action = parms->action; opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -212,12 +214,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5cc8e407e791..0939e6b2ba4d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -99,6 +99,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, goto put_chain; } params_new->update_flags = parm->update_flags; + params_new->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -580,7 +581,7 @@ TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); + action = params->action; if (unlikely(action == TC_ACT_SHOT)) goto drop; @@ -631,9 +632,9 @@ drop: static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_csum *p = to_tcf_csum(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_csum *p = to_tcf_csum(a); - struct tcf_csum_params *params; + const struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, @@ -641,10 +642,9 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, }; struct tcf_t t; - spin_lock_bh(&p->tcf_lock); - params = rcu_dereference_protected(p->params, - lockdep_is_held(&p->tcf_lock)); - opt.action = p->tcf_action; + rcu_read_lock(); + params = rcu_dereference(p->params); + opt.action = params->action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -653,12 +653,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index c02f39efc6ef..6749a4a9a9cd 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -977,7 +977,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, p = rcu_dereference_bh(c->params); - retval = READ_ONCE(c->tcf_action); + retval = p->action; commit = p->ct_action & TCA_CT_ACT_COMMIT; clear = p->ct_action & TCA_CT_ACT_CLEAR; tmpl = p->tmpl; @@ -1409,6 +1409,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (err) goto cleanup; + params->action = parm->action; spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params = rcu_replace_pointer(c->params, params, @@ -1442,8 +1443,8 @@ static void tcf_ct_cleanup(struct tc_action *a) } static int tcf_ct_dump_key_val(struct sk_buff *skb, - void *val, int val_type, - void *mask, int mask_type, + const void *val, int val_type, + const void *mask, int mask_type, int len) { int err; @@ -1464,9 +1465,9 @@ static int tcf_ct_dump_key_val(struct sk_buff *skb, return 0; } -static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) +static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p) { - struct nf_nat_range2 *range = &p->range; + const struct nf_nat_range2 *range = &p->range; if (!(p->ct_action & TCA_CT_ACT_NAT)) return 0; @@ -1504,7 +1505,8 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) return 0; } -static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) +static int tcf_ct_dump_helper(struct sk_buff *skb, + const struct nf_conntrack_helper *helper) { if (!helper) return 0; @@ -1521,9 +1523,8 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ct *c = to_ct(a); - struct tcf_ct_params *p; - + const struct tcf_ct *c = to_ct(a); + const struct tcf_ct_params *p; struct tc_ct opt = { .index = c->tcf_index, .refcnt = refcount_read(&c->tcf_refcnt) - ref, @@ -1531,10 +1532,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&c->tcf_lock); - p = rcu_dereference_protected(c->params, - lockdep_is_held(&c->tcf_lock)); - opt.action = c->tcf_action; + rcu_read_lock(); + p = rcu_dereference(c->params); + opt.action = p->action; if (tcf_ct_dump_key_val(skb, &p->ct_action, TCA_CT_ACTION, @@ -1579,11 +1579,11 @@ skip_dump: tcf_tm_dump(&t, &c->tcf_tm); if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD)) goto nla_put_failure; - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 5b1241ddc758..71efe04d00b5 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -44,9 +44,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -57,9 +57,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb) { - ca->stats_cpmark_set++; + atomic64_inc(&ca->stats_cpmark_set); skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } @@ -88,13 +88,11 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, struct tcf_ctinfo_params *cp; struct nf_conn *ct; int proto, wlen; - int action; cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); - action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { @@ -141,7 +139,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, if (thash) nf_ct_put(ct); out: - return action; + return cp->action; } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { @@ -258,6 +256,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, cp_new->mode |= CTINFO_MODE_CPMARK; } + cp_new->action = actparm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, @@ -282,25 +282,24 @@ release_idr: static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - struct tcf_ctinfo *ci = to_ctinfo(a); + const struct tcf_ctinfo *ci = to_ctinfo(a); + unsigned char *b = skb_tail_pointer(skb); + const struct tcf_ctinfo_params *cp; struct tc_ctinfo opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - unsigned char *b = skb_tail_pointer(skb); - struct tcf_ctinfo_params *cp; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - cp = rcu_dereference_protected(ci->params, - lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + cp = rcu_dereference(ci->params); tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) goto nla_put_failure; - opt.action = ci->tcf_action; + opt.action = cp->action; if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) goto nla_put_failure; @@ -323,22 +322,25 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, } if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, - ci->stats_dscp_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_set), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, - ci->stats_dscp_error, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_error), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, - ci->stats_cpmark_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_cpmark_set), + TCA_CTINFO_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 9f86f4e666d3..6654011dcd2b 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -57,7 +57,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; - int ret, mac_len; + int mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -72,8 +72,6 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, mac_len = skb_network_offset(skb); } - ret = READ_ONCE(m->tcf_action); - p = rcu_dereference_bh(m->mpls_p); switch (p->tcfm_action) { @@ -122,7 +120,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - return ret; + return p->action; drop: qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); @@ -296,6 +294,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, ACT_MPLS_BOS_NOT_SET); p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO], htons(ETH_P_MPLS_UC)); + p->action = parm->action; spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -330,8 +329,8 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_mpls *m = to_mpls(a); - struct tcf_mpls_params *p; + const struct tcf_mpls *m = to_mpls(a); + const struct tcf_mpls_params *p; struct tc_mpls opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, @@ -339,10 +338,10 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&m->tcf_lock); - opt.action = m->tcf_action; - p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(m->mpls_p); opt.m_action = p->tcfm_action; + opt.action = p->action; if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -370,12 +369,12 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) goto nla_put_failure; - spin_unlock_bh(&m->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&m->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -EMSGSIZE; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index d541f553805f..26241d80ebe0 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -91,6 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, nparm->new_addr = parm->new_addr; nparm->mask = parm->mask; nparm->flags = parm->flags; + nparm->action = parm->action; p = to_tcf_nat(*a); @@ -130,17 +131,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); - parms = rcu_dereference_bh(p->parms); + action = parms->action; + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + old_addr = parms->old_addr; new_addr = parms->new_addr; mask = parms->mask; egress = parms->flags & TCA_NAT_FLAG_EGRESS; - if (unlikely(action == TC_ACT_SHOT)) - goto drop; - noff = skb_network_offset(skb); if (!pskb_may_pull(skb, sizeof(*iph) + noff)) goto drop; @@ -268,21 +268,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat_parms *parms; struct tc_nat opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; - struct tcf_nat_parms *parms; struct tcf_t t; - spin_lock_bh(&p->tcf_lock); - - opt.action = p->tcf_action; + rcu_read_lock(); - parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock)); + parms = rcu_dereference(p->parms); + opt.action = parms->action; opt.old_addr = parms->old_addr; opt.new_addr = parms->new_addr; opt.mask = parms->mask; @@ -294,12 +293,12 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fc0a35a7b62a..4b65901397a8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -279,7 +279,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } p = to_pedit(*a); - + nparms->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); @@ -483,7 +483,7 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, bad: tcf_action_inc_overlimit_qstats(&p->common); done: - return p->tcf_action; + return parms->action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, @@ -500,19 +500,19 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_pedit *p = to_pedit(a); - struct tcf_pedit_parms *parms; + const struct tcf_pedit *p = to_pedit(a); + const struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; - spin_lock_bh(&p->tcf_lock); - parms = rcu_dereference_protected(p->parms, 1); + rcu_read_lock(); + parms = rcu_dereference(p->parms); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; @@ -521,7 +521,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; - opt->action = p->tcf_action; + opt->action = parms->action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; @@ -540,13 +540,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); kfree(opt); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); kfree(opt); return -1; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a214ed681142..0e1c61183379 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -198,6 +198,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, psched_ppscfg_precompute(&new->ppsrate, pps); } + new->action = parm->action; spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); @@ -254,8 +255,8 @@ TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, tcf_lastuse_update(&police->tcf_tm); bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); - ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); + ret = p->action; if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; @@ -338,9 +339,9 @@ static void tcf_police_stats_update(struct tc_action *a, static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_police *police = to_police(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_police *police = to_police(a); - struct tcf_police_params *p; + const struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, @@ -348,10 +349,9 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&police->tcf_lock); - opt.action = police->tcf_action; - p = rcu_dereference_protected(police->params, - lockdep_is_held(&police->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(police->params); + opt.action = p->action; opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); if (p->rate_present) { @@ -392,12 +392,12 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1f1d9ce3e968..8c1d1554f657 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -43,13 +43,11 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; - int action; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); - action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) skb->priority = params->priority; @@ -85,7 +83,7 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; - return action; + return params->action; err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -262,6 +260,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; + params_new->action = parm->action; spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, @@ -284,9 +283,9 @@ release_idr: static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_skbedit *d = to_skbedit(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbedit *d = to_skbedit(a); - struct tcf_skbedit_params *params; + const struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, @@ -295,10 +294,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, u64 pure_flags = 0; struct tcf_t t; - spin_lock_bh(&d->tcf_lock); - params = rcu_dereference_protected(d->params, - lockdep_is_held(&d->tcf_lock)); - opt.action = d->tcf_action; + rcu_read_lock(); + params = rcu_dereference(d->params); + opt.action = params->action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -333,12 +331,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index 7ea8b54b2ab1..adcb618a2bfc 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -130,8 +130,7 @@ static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, return 0; } -BTF_ID_LIST(bpf_qdisc_init_prologue_ids) -BTF_ID(func, bpf_qdisc_init_prologue) +BTF_ID_LIST_SINGLE(bpf_qdisc_init_prologue_ids, func, bpf_qdisc_init_prologue) static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) @@ -161,8 +160,7 @@ static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, return insn - insn_buf; } -BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids) -BTF_ID(func, bpf_qdisc_reset_destroy_epilogue) +BTF_ID_LIST_SINGLE(bpf_qdisc_reset_destroy_epilogue_ids, func, bpf_qdisc_reset_destroy_epilogue) static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, s16 ctx_stack_off) @@ -451,8 +449,7 @@ static struct bpf_struct_ops bpf_Qdisc_ops = { .owner = THIS_MODULE, }; -BTF_ID_LIST(bpf_sk_buff_dtor_ids) -BTF_ID(func, bpf_kfree_skb) +BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb) static int __init bpf_qdisc_kfunc_init(void) { diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 420c66203b17..6b3d0af72c39 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -108,7 +108,7 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) struct text_match *tm = EM_TEXT_PRIV(m); struct tcf_em_text conf; - strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1); + strscpy(conf.algo, tm->config->ops->name); conf.from_offset = tm->from_offset; conf.to_offset = tm->to_offset; conf.from_layer = tm->from_layer; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c5e3673aadbe..d7c767b861a4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -336,17 +336,22 @@ out: return q; } -static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid, + struct netlink_ext_ack *extack) { unsigned long cl; const struct Qdisc_class_ops *cops = p->ops->cl_ops; - if (cops == NULL) - return NULL; + if (cops == NULL) { + NL_SET_ERR_MSG(extack, "Parent qdisc is not classful"); + return ERR_PTR(-EOPNOTSUPP); + } cl = cops->find(p, classid); - if (cl == 0) - return NULL; + if (cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class not found"); + return ERR_PTR(-ENOENT); + } return cops->leaf(p, cl); } @@ -596,16 +601,6 @@ out: qdisc_skb_cb(skb)->pkt_len = pkt_len; } -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) -{ - if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { - pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", - txt, qdisc->ops->id, qdisc->handle >> 16); - qdisc->flags |= TCQ_F_WARN_NONWC; - } -} -EXPORT_SYMBOL(qdisc_warn_nonwc); - static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, @@ -780,15 +775,12 @@ static u32 qdisc_alloc_handle(struct net_device *dev) void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { - bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; bool notify; int drops; - if (n == 0 && len == 0) - return; drops = max_t(int, n, 0); rcu_read_lock(); while ((parentid = sch->parent)) { @@ -797,17 +789,8 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) if (sch->flags & TCQ_F_NOPARENT) break; - /* Notify parent qdisc only if child qdisc becomes empty. - * - * If child was empty even before update then backlog - * counter is screwed and we skip notification because - * parent class is already passive. - * - * If the original child was offloaded then it is allowed - * to be seem as empty, so the parent is notified anyway. - */ - notify = !sch->q.qlen && !WARN_ON_ONCE(!n && - !qdisc_is_offloaded); + /* Notify parent qdisc only if child qdisc becomes empty. */ + notify = !sch->q.qlen; /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { @@ -816,6 +799,9 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { + /* Note that qlen_notify must be idempotent as it may get called + * multiple times. + */ cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); } @@ -1499,7 +1485,7 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); } else if (dev_ingress_queue(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } @@ -1510,6 +1496,8 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; } + if (IS_ERR(q)) + return PTR_ERR(q); if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Invalid handle"); @@ -1611,7 +1599,9 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); + if (IS_ERR(q)) + return PTR_ERR(q); } else if (dev_ingress_queue_create(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 48dd8c88903f..dbcfb948c867 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1407,7 +1407,10 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_init() */ - hdr_len = skb_transport_offset(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c new file mode 100644 index 000000000000..845375ebd4ea --- /dev/null +++ b/net/sched/sch_dualpi2.c @@ -0,0 +1,1175 @@ +// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +/* Copyright (C) 2024 Nokia + * + * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com> + * Author: Olga Albisser <olga@albisser.org> + * Author: Henrik Steen <henrist@henrist.net> + * Author: Olivier Tilmans <olivier.tilmans@nokia.com> + * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> + * + * DualPI Improved with a Square (dualpi2): + * - Supports congestion controls that comply with the Prague requirements + * in RFC9331 (e.g. TCP-Prague) + * - Supports coupled dual-queue with PI2 as defined in RFC9332 + * - Supports ECN L4S-identifier (IP.ECN==0b*1) + * + * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks, + * they do not meet the 'Prague L4S Requirements' listed in RFC 9331 + * Section 4, so they can only be used with DualPI2 in a datacenter + * context. + * + * References: + * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332 + * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and + * scalable TCP." in proc. ACM CoNEXT'16, 2016. + */ + +#include <linux/errno.h> +#include <linux/hrtimer.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/limits.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> + +#include <net/gso.h> +#include <net/inet_ecn.h> +#include <net/pkt_cls.h> +#include <net/pkt_sched.h> + +/* 32b enable to support flows with windows up to ~8.6 * 1e9 packets + * i.e., twice the maximal snd_cwnd. + * MAX_PROB must be consistent with the RNG in dualpi2_roll(). + */ +#define MAX_PROB U32_MAX + +/* alpha/beta values exchanged over netlink are in units of 256ns */ +#define ALPHA_BETA_SHIFT 8 + +/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later + * computations. Consequently (see and dualpi2_scale_alpha_beta()), their + * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1 + * (~4MHz) as those are given in 1/256th. This enable to tune alpha/beta to + * control flows whose maximal RTTs can be in usec up to few secs. + */ +#define ALPHA_BETA_MAX ((1U << 31) - 1) + +/* Internal alpha/beta are in units of 64ns. + * This enables to use all alpha/beta values in the allowed range without loss + * of precision due to rounding when scaling them internally, e.g., + * scale_alpha_beta(1) will not round down to 0. + */ +#define ALPHA_BETA_GRANULARITY 6 + +#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY) + +/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */ +#define MAX_WC 100 + +struct dualpi2_sched_data { + struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */ + struct Qdisc *sch; /* The Classic queue (C-queue) */ + + /* Registered tc filters */ + struct tcf_proto __rcu *tcf_filters; + struct tcf_block *tcf_block; + + /* PI2 parameters */ + u64 pi2_target; /* Target delay in nanoseconds */ + u32 pi2_tupdate; /* Timer frequency in nanoseconds */ + u32 pi2_prob; /* Base PI probability */ + u32 pi2_alpha; /* Gain factor for the integral rate response */ + u32 pi2_beta; /* Gain factor for the proportional response */ + struct hrtimer pi2_timer; /* prob update timer */ + + /* Step AQM (L-queue only) parameters */ + u32 step_thresh; /* Step threshold */ + bool step_in_packets; /* Step thresh in packets (1) or time (0) */ + + /* C-queue starvation protection */ + s32 c_protection_credit; /* Credit (sign indicates which queue) */ + s32 c_protection_init; /* Reset value of the credit */ + u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */ + u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */ + + /* General dualQ parameters */ + u32 memory_limit; /* Memory limit of both queues */ + u8 coupling_factor;/* Coupling factor (k) between both queues */ + u8 ecn_mask; /* Mask to match packets into L-queue */ + u32 min_qlen_step; /* Minimum queue length to apply step thresh */ + bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */ + bool drop_overload; /* Drop (1) on overload, or overflow (0) */ + bool split_gso; /* Split aggregated skb (1) or leave as is (0) */ + + /* Statistics */ + u64 c_head_ts; /* Enqueue timestamp of the C-queue head */ + u64 l_head_ts; /* Enqueue timestamp of the L-queue head */ + u64 last_qdelay; /* Q delay val at the last probability update */ + u32 packets_in_c; /* Enqueue packet counter of the C-queue */ + u32 packets_in_l; /* Enqueue packet counter of the L-queue */ + u32 maxq; /* Maximum queue size of the C-queue */ + u32 ecn_mark; /* ECN mark pkt counter due to PI probability */ + u32 step_marks; /* ECN mark pkt counter due to step AQM */ + u32 memory_used; /* Memory used of both queues */ + u32 max_memory_used;/* Maximum used memory */ + + /* Deferred drop statistics */ + u32 deferred_drops_cnt; /* Packets dropped */ + u32 deferred_drops_len; /* Bytes dropped */ +}; + +struct dualpi2_skb_cb { + u64 ts; /* Timestamp at enqueue */ + u8 apply_step:1, /* Can we apply the step threshold */ + classified:2, /* Packet classification results */ + ect:2; /* Packet ECT codepoint */ +}; + +enum dualpi2_classification_results { + DUALPI2_C_CLASSIC = 0, /* C-queue */ + DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */ + DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */ + __DUALPI2_C_MAX /* Keep last*/ +}; + +static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb)); + return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference) +{ + return reference - dualpi2_skb_cb(skb)->ts; +} + +static u64 head_enqueue_time(struct Qdisc *q) +{ + struct sk_buff *skb = qdisc_peek_head(q); + + return skb ? dualpi2_skb_cb(skb)->ts : 0; +} + +static u32 dualpi2_scale_alpha_beta(u32 param) +{ + u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); + + do_div(tmp, NSEC_PER_SEC); + return tmp; +} + +static u32 dualpi2_unscale_alpha_beta(u32 param) +{ + u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING); + + do_div(tmp, MAX_PROB); + return tmp; +} + +static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) +{ + return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); +} + +static bool skb_is_l4s(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S; +} + +static bool skb_in_l_queue(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC; +} + +static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q) +{ + return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step; +} + +static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb) +{ + if (INET_ECN_set_ce(skb)) { + q->ecn_mark++; + return true; + } + return false; +} + +static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) +{ + q->c_protection_credit = q->c_protection_init; +} + +/* This computes the initial credit value and WRR weight for the L queue (wl) + * from the weight of the C queue (wc). + * If wl > wc, the scheduler will start with the L queue when reset. + */ +static void dualpi2_calculate_c_protection(struct Qdisc *sch, + struct dualpi2_sched_data *q, u32 wc) +{ + q->c_protection_wc = wc; + q->c_protection_wl = MAX_WC - wc; + q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) * + ((int)q->c_protection_wc - (int)q->c_protection_wl); + dualpi2_reset_c_protection(q); +} + +static bool dualpi2_roll(u32 prob) +{ + return get_random_u32() <= prob; +} + +/* Packets in the C-queue are subject to a marking probability pC, which is the + * square of the internal PI probability (i.e., have an overall lower mark/drop + * probability). If the qdisc is overloaded, ignore ECT values and only drop. + * + * Note that this marking scheme is also applied to L4S packets during overload. + * Return true if packet dropping is required in C queue + */ +static bool dualpi2_classic_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, u32 prob, + bool overload) +{ + if (dualpi2_roll(prob) && dualpi2_roll(prob)) { + if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; + dualpi2_mark(q, skb); + } + return false; +} + +/* Packets in the L-queue are subject to a marking probability pL given by the + * internal PI probability scaled by the coupling factor. + * + * On overload (i.e., @local_l_prob is >= 100%): + * - if the qdisc is configured to trade losses to preserve latency (i.e., + * @q->drop_overload), apply classic drops first before marking. + * - otherwise, preserve the "no loss" property of ECN at the cost of queueing + * delay, eventually resulting in taildrop behavior once sch->limit is + * reached. + * Return true if packet dropping is required in L queue + */ +static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, + u64 local_l_prob, u32 prob, + bool overload) +{ + if (overload) { + /* Apply classic drop */ + if (!q->drop_overload || + !(dualpi2_roll(prob) && dualpi2_roll(prob))) + goto mark; + return true; + } + + /* We can safely cut the upper 32b as overload==false */ + if (dualpi2_roll(local_l_prob)) { + /* Non-ECT packets could have classified as L4S by filters. */ + if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; +mark: + dualpi2_mark(q, skb); + } + return false; +} + +/* Decide whether a given packet must be dropped (or marked if ECT), according + * to the PI2 probability. + * + * Never mark/drop if we have a standing queue of less than 2 MTUs. + */ +static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + u64 local_l_prob; + bool overload; + u32 prob; + + if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch))) + return false; + + prob = READ_ONCE(q->pi2_prob); + local_l_prob = (u64)prob * q->coupling_factor; + overload = local_l_prob > MAX_PROB; + + switch (dualpi2_skb_cb(skb)->classified) { + case DUALPI2_C_CLASSIC: + return dualpi2_classic_marking(q, skb, prob, overload); + case DUALPI2_C_L4S: + return dualpi2_scalable_marking(q, skb, local_l_prob, prob, + overload); + default: /* DUALPI2_C_LLLL */ + return false; + } +} + +static void dualpi2_read_ect(struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + int wlen = skb_network_offset(skb); + + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK; + break; + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK; + break; + default: + goto not_ecn; + } + return; + +not_ecn: + /* Non pullable/writable packets can only be dropped hence are + * classified as not ECT. + */ + cb->ect = INET_ECN_NOT_ECT; +} + +static int dualpi2_skb_classify(struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + struct tcf_result res; + struct tcf_proto *fl; + int result; + + dualpi2_read_ect(skb); + if (cb->ect & q->ecn_mask) { + cb->classified = DUALPI2_C_L4S; + return NET_XMIT_SUCCESS; + } + + if (TC_H_MAJ(skb->priority) == q->sch->handle && + TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) { + cb->classified = TC_H_MIN(skb->priority); + return NET_XMIT_SUCCESS; + } + + fl = rcu_dereference_bh(q->tcf_filters); + if (!fl) { + cb->classified = DUALPI2_C_CLASSIC; + return NET_XMIT_SUCCESS; + } + + result = tcf_classify(skb, NULL, fl, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } +#endif + cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ? + TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC; + } + return NET_XMIT_SUCCESS; +} + +static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct dualpi2_skb_cb *cb; + + if (unlikely(qdisc_qlen(sch) >= sch->limit) || + unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) { + qdisc_qstats_overlimit(sch); + if (skb_in_l_queue(skb)) + qdisc_qstats_overlimit(q->l_queue); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); + } + + if (q->drop_early && must_drop(sch, q, skb)) { + qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_CONGESTED); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } + + cb = dualpi2_skb_cb(skb); + cb->ts = ktime_get_ns(); + q->memory_used += skb->truesize; + if (q->memory_used > q->max_memory_used) + q->max_memory_used = q->memory_used; + + if (qdisc_qlen(sch) > q->maxq) + q->maxq = qdisc_qlen(sch); + + if (skb_in_l_queue(skb)) { + /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */ + dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q); + + /* Keep the overall qdisc stats consistent */ + ++sch->q.qlen; + qdisc_qstats_backlog_inc(sch, skb); + ++q->packets_in_l; + if (!q->l_head_ts) + q->l_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, q->l_queue); + } + ++q->packets_in_c; + if (!q->c_head_ts) + q->c_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, sch); +} + +/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue + * each of those individually. This yields the following benefits, at the + * expense of CPU usage: + * - Finer-grained AQM actions as the sub-packets of a burst no longer share the + * same fate (e.g., the random mark/drop probability is applied individually) + * - Improved precision of the starvation protection/WRR scheduler at dequeue, + * as the size of the dequeued packets will be smaller. + */ +static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + err = dualpi2_skb_classify(q, skb); + if (err != NET_XMIT_SUCCESS) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + if (q->split_gso && skb_is_gso(skb)) { + netdev_features_t features; + struct sk_buff *nskb, *next; + int cnt, byte_len, orig_len; + int err; + + features = netif_skb_features(skb); + nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(nskb)) + return qdisc_drop(skb, sch, to_free); + + cnt = 1; + byte_len = 0; + orig_len = qdisc_pkt_len(skb); + skb_list_walk_safe(nskb, nskb, next) { + skb_mark_not_on_list(nskb); + + /* Iterate through GSO fragments of an skb: + * (1) Set pkt_len from the single GSO fragments + * (2) Copy classified and ect values of an skb + * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb + */ + qdisc_skb_cb(nskb)->pkt_len = nskb->len; + dualpi2_skb_cb(nskb)->classified = + dualpi2_skb_cb(skb)->classified; + dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; + err = dualpi2_enqueue_skb(nskb, sch, to_free); + + if (err == NET_XMIT_SUCCESS) { + /* Compute the backlog adjustment that needs + * to be propagated in the qdisc tree to reflect + * all new skbs successfully enqueued. + */ + ++cnt; + byte_len += nskb->len; + } + } + if (cnt > 1) { + /* The caller will add the original skb stats to its + * backlog, compensate this if any nskb is enqueued. + */ + --cnt; + byte_len -= orig_len; + } + qdisc_tree_reduce_backlog(sch, -cnt, -byte_len); + consume_skb(skb); + return err; + } + return dualpi2_enqueue_skb(skb, sch, to_free); +} + +/* Select the queue from which the next packet can be dequeued, ensuring that + * neither queue can starve the other with a WRR scheduler. + * + * The sign of the WRR credit determines the next queue, while the size of + * the dequeued packet determines the magnitude of the WRR credit change. If + * either queue is empty, the WRR credit is kept unchanged. + * + * As the dequeued packet can be dropped later, the caller has to perform the + * qdisc_bstats_update() calls. + */ +static struct sk_buff *dequeue_packet(struct Qdisc *sch, + struct dualpi2_sched_data *q, + int *credit_change, + u64 now) +{ + struct sk_buff *skb = NULL; + int c_len; + + *credit_change = 0; + c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue); + if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) { + skb = __qdisc_dequeue_head(&q->l_queue->q); + WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue)); + if (c_len) + *credit_change = q->c_protection_wc; + qdisc_qstats_backlog_dec(q->l_queue, skb); + + /* Keep the global queue size consistent */ + --sch->q.qlen; + q->memory_used -= skb->truesize; + } else if (c_len) { + skb = __qdisc_dequeue_head(&sch->q); + WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch)); + if (qdisc_qlen(q->l_queue)) + *credit_change = ~((s32)q->c_protection_wl) + 1; + q->memory_used -= skb->truesize; + } else { + dualpi2_reset_c_protection(q); + return NULL; + } + *credit_change *= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); + return skb; +} + +static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb, + u64 now) +{ + u64 qdelay = 0; + + if (q->step_in_packets) + qdelay = qdisc_qlen(q->l_queue); + else + qdelay = dualpi2_sojourn_time(skb, now); + + if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) { + if (!dualpi2_skb_cb(skb)->ect) { + /* Drop this non-ECT packet */ + return 1; + } + + if (dualpi2_mark(q, skb)) + ++q->step_marks; + } + qdisc_bstats_update(q->l_queue, skb); + return 0; +} + +static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, + struct Qdisc *sch, enum skb_drop_reason reason) +{ + ++q->deferred_drops_cnt; + q->deferred_drops_len += qdisc_pkt_len(skb); + kfree_skb_reason(skb, reason); + qdisc_qstats_drop(sch); +} + +static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int credit_change; + u64 now; + + now = ktime_get_ns(); + + while ((skb = dequeue_packet(sch, q, &credit_change, now))) { + if (!q->drop_early && must_drop(sch, q, skb)) { + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_QDISC_CONGESTED); + continue; + } + + if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) { + qdisc_qstats_drop(q->l_queue); + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_DUALPI2_STEP_DROP); + continue; + } + + q->c_protection_credit += credit_change; + qdisc_bstats_update(sch, skb); + break; + } + + if (q->deferred_drops_cnt) { + qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt, + q->deferred_drops_len); + q->deferred_drops_cnt = 0; + q->deferred_drops_len = 0; + } + return skb; +} + +static s64 __scale_delta(u64 diff) +{ + do_div(diff, 1 << ALPHA_BETA_GRANULARITY); + return diff; +} + +static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, + u64 *qdelay_l) +{ + u64 now, qc, ql; + + now = ktime_get_ns(); + qc = READ_ONCE(q->c_head_ts); + ql = READ_ONCE(q->l_head_ts); + + *qdelay_c = qc ? now - qc : 0; + *qdelay_l = ql ? now - ql : 0; +} + +static u32 calculate_probability(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + u32 new_prob; + u64 qdelay_c; + u64 qdelay_l; + u64 qdelay; + s64 delta; + + get_queue_delays(q, &qdelay_c, &qdelay_l); + qdelay = max(qdelay_l, qdelay_c); + + /* Alpha and beta take at most 32b, i.e, the delay difference would + * overflow for queuing delay differences > ~4.2sec. + */ + delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha; + delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta; + q->last_qdelay = qdelay; + + /* Bound new_prob between 0 and MAX_PROB */ + if (delta > 0) { + new_prob = __scale_delta(delta) + q->pi2_prob; + if (new_prob < q->pi2_prob) + new_prob = MAX_PROB; + } else { + new_prob = q->pi2_prob - __scale_delta(~delta + 1); + if (new_prob > q->pi2_prob) + new_prob = 0; + } + + /* If we do not drop on overload, ensure we cap the L4S probability to + * 100% to keep window fairness when overflowing. + */ + if (!q->drop_overload) + return min_t(u32, new_prob, MAX_PROB / q->coupling_factor); + return new_prob; +} + +static u32 get_memory_limit(struct Qdisc *sch, u32 limit) +{ + /* Apply rule of thumb, i.e., doubling the packet length, + * to further include per packet overhead in memory_limit. + */ + u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch))); + + if (upper_32_bits(memlim)) + return U32_MAX; + else + return lower_32_bits(memlim); +} + +static u32 convert_us_to_nsec(u32 us) +{ + u64 ns = mul_u32_u32(us, NSEC_PER_USEC); + + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + +static u32 convert_ns_to_usec(u64 ns) +{ + do_div(ns, NSEC_PER_USEC); + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + +static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) +{ + struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + + rcu_read_lock(); + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + WRITE_ONCE(q->pi2_prob, calculate_probability(sch)); + hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); + + spin_unlock(root_lock); + rcu_read_unlock(); + return HRTIMER_RESTART; +} + +static struct netlink_range_validation dualpi2_alpha_beta_range = { + .min = 1, + .max = ALPHA_BETA_MAX, +}; + +static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = { + [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_TARGET] = { .type = NLA_U32 }, + [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_ALPHA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_BETA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 }, + [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 }, + [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 }, + [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1), + [TCA_DUALPI2_DROP_OVERLOAD] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX), + [TCA_DUALPI2_DROP_EARLY] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX), + [TCA_DUALPI2_C_PROTECTION] = + NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC), + [TCA_DUALPI2_ECN_MASK] = + NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT, + TCA_DUALPI2_ECN_MASK_MAX), + [TCA_DUALPI2_SPLIT_GSO] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX), +}; + +static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_DUALPI2_MAX + 1]; + struct dualpi2_sched_data *q; + int old_backlog; + int old_qlen; + int err; + + if (!opt || !nla_len(opt)) { + NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required"); + return -EINVAL; + } + err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy, + extack); + if (err < 0) + return err; + if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) { + NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes"); + return -EINVAL; + } + + q = qdisc_priv(sch); + sch_tree_lock(sch); + + if (tb[TCA_DUALPI2_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]); + + WRITE_ONCE(sch->limit, limit); + WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit)); + } + + if (tb[TCA_DUALPI2_MEMORY_LIMIT]) + WRITE_ONCE(q->memory_limit, + nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT])); + + if (tb[TCA_DUALPI2_TARGET]) { + u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]); + + WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC); + } + + if (tb[TCA_DUALPI2_TUPDATE]) { + u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]); + + WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate)); + } + + if (tb[TCA_DUALPI2_ALPHA]) { + u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]); + + WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha)); + } + + if (tb[TCA_DUALPI2_BETA]) { + u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]); + + WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta)); + } + + if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]); + + WRITE_ONCE(q->step_in_packets, true); + WRITE_ONCE(q->step_thresh, step_th); + } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]); + + WRITE_ONCE(q->step_in_packets, false); + WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th)); + } + + if (tb[TCA_DUALPI2_MIN_QLEN_STEP]) + WRITE_ONCE(q->min_qlen_step, + nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP])); + + if (tb[TCA_DUALPI2_COUPLING]) { + u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]); + + WRITE_ONCE(q->coupling_factor, coupling); + } + + if (tb[TCA_DUALPI2_DROP_OVERLOAD]) { + u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]); + + WRITE_ONCE(q->drop_overload, (bool)drop_overload); + } + + if (tb[TCA_DUALPI2_DROP_EARLY]) { + u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]); + + WRITE_ONCE(q->drop_early, (bool)drop_early); + } + + if (tb[TCA_DUALPI2_C_PROTECTION]) { + u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]); + + dualpi2_calculate_c_protection(sch, q, wc); + } + + if (tb[TCA_DUALPI2_ECN_MASK]) { + u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]); + + WRITE_ONCE(q->ecn_mask, ecn_mask); + } + + if (tb[TCA_DUALPI2_SPLIT_GSO]) { + u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]); + + WRITE_ONCE(q->split_gso, (bool)split_gso); + } + + old_qlen = qdisc_qlen(sch); + old_backlog = sch->qstats.backlog; + while (qdisc_qlen(sch) > sch->limit || + q->memory_used > q->memory_limit) { + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); + + q->memory_used -= skb->truesize; + qdisc_qstats_backlog_dec(sch, skb); + rtnl_qdisc_drop(skb, sch); + } + qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch), + old_backlog - sch->qstats.backlog); + + sch_tree_unlock(sch); + return 0; +} + +/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */ +static void dualpi2_reset_default(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->sch->limit = 10000; /* Max 125ms at 1Gbps */ + q->memory_limit = get_memory_limit(sch, q->sch->limit); + + q->pi2_target = 15 * NSEC_PER_MSEC; + q->pi2_tupdate = 16 * NSEC_PER_MSEC; + q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */ + q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */ + + q->step_thresh = 1 * NSEC_PER_MSEC; + q->step_in_packets = false; + + dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */ + + q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */ + q->min_qlen_step = 0; /* Always apply step mark in L-queue */ + q->coupling_factor = 2; /* window fairness for equal RTTs */ + q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */ + q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */ + q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */ +} + +static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, 1), extack); + if (!q->l_queue) + return -ENOMEM; + + err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack); + if (err) + return err; + + q->sch = sch; + dualpi2_reset_default(sch); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + + if (opt && nla_len(opt)) { + err = dualpi2_change(sch, opt, extack); + + if (err) + return err; + } + + hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), + HRTIMER_MODE_ABS_PINNED); + return 0; +} + +static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + bool step_in_pkts; + u32 step_th; + + step_in_pkts = READ_ONCE(q->step_in_packets); + step_th = READ_ONCE(q->step_thresh); + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + if (!step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US, + convert_ns_to_usec(step_th)) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct tc_dualpi2_xstats st = { + .prob = READ_ONCE(q->pi2_prob), + .packets_in_c = q->packets_in_c, + .packets_in_l = q->packets_in_l, + .maxq = q->maxq, + .ecn_mark = q->ecn_mark, + .credit = q->c_protection_credit, + .step_marks = q->step_marks, + .memory_used = q->memory_used, + .max_memory_used = q->max_memory_used, + .memory_limit = q->memory_limit, + }; + u64 qc, ql; + + get_queue_delays(q, &qc, &ql); + st.delay_l = convert_ns_to_usec(ql); + st.delay_c = convert_ns_to_usec(qc); + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +/* Reset both L-queue and C-queue, internal packet counters, PI probability, + * C-queue protection credit, and timestamps, while preserving current + * configuration of DUALPI2. + */ +static void dualpi2_reset(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + qdisc_reset_queue(q->l_queue); + q->c_head_ts = 0; + q->l_head_ts = 0; + q->pi2_prob = 0; + q->packets_in_c = 0; + q->packets_in_l = 0; + q->maxq = 0; + q->ecn_mark = 0; + q->step_marks = 0; + q->memory_used = 0; + q->max_memory_used = 0; + dualpi2_reset_c_protection(q); +} + +static void dualpi2_destroy(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->pi2_tupdate = 0; + hrtimer_cancel(&q->pi2_timer); + if (q->l_queue) + qdisc_put(q->l_queue); + tcf_block_put(q->tcf_block); +} + +static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void dualpi2_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->tcf_block; +} + +static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + unsigned int i; + + if (arg->stop) + return; + + /* We statically define only 2 queues */ + for (i = 0; i < 2; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +/* Minimal class support to handle tc filters */ +static const struct Qdisc_class_ops dualpi2_class_ops = { + .leaf = dualpi2_leaf, + .find = dualpi2_find, + .tcf_block = dualpi2_tcf_block, + .bind_tcf = dualpi2_bind, + .unbind_tcf = dualpi2_unbind, + .walk = dualpi2_walk, +}; + +static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { + .id = "dualpi2", + .cl_ops = &dualpi2_class_ops, + .priv_size = sizeof(struct dualpi2_sched_data), + .enqueue = dualpi2_qdisc_enqueue, + .dequeue = dualpi2_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = dualpi2_init, + .destroy = dualpi2_destroy, + .reset = dualpi2_reset, + .change = dualpi2_change, + .dump = dualpi2_dump, + .dump_stats = dualpi2_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init dualpi2_module_init(void) +{ + return register_qdisc(&dualpi2_qdisc_ops); +} + +static void __exit dualpi2_module_exit(void) +{ + unregister_qdisc(&dualpi2_qdisc_ops); +} + +module_init(dualpi2_module_init); +module_exit(dualpi2_module_exit); + +MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler"); +MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>"); +MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>"); +MODULE_AUTHOR("Olga Albisser <olga@albisser.org>"); +MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>"); +MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("1.0"); diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 2c069f0181c6..037f764822b9 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -661,7 +661,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) list_del_init(&q->classes[i].alist); - qdisc_tree_flush_backlog(q->classes[i].qdisc); + qdisc_purge_queue(q->classes[i].qdisc); } WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 16afb834fe4a..1e008a228ebd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -740,6 +740,8 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, err = skb_array_produce(q, skb); if (unlikely(err)) { + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); + if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 5a7745170e84..d8fd35da32a7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -835,22 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) } } -static unsigned int -qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - unsigned int len; - - skb = sch->ops->peek(sch); - if (unlikely(skb == NULL)) { - qdisc_warn_nonwc("qdisc_peek_len", sch); - return 0; - } - len = qdisc_pkt_len(skb); - - return len; -} - static void hfsc_adjust_levels(struct hfsc_class *cl) { diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 14bf71f57057..c968ea763774 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -821,7 +821,9 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) u32 *pid; } stk[TC_HTB_MAXDEPTH], *sp = stk; - BUG_ON(!hprio->row.rb_node); + if (unlikely(!hprio->row.rb_node)) + return NULL; + sp->root = hprio->row.rb_node; sp->pptr = &hprio->ptr; sp->pid = &hprio->last_ptr_id; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index fdd79d3ccd8c..eafc316ae319 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -973,6 +973,41 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } +static const struct Qdisc_class_ops netem_class_ops; + +static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, + struct netlink_ext_ack *extack) +{ + struct Qdisc *root, *q; + unsigned int i; + + root = qdisc_root_sleeping(sch); + + if (sch != root && root->ops->cl_ops == &netem_class_ops) { + if (duplicates || + ((struct netem_sched_data *)qdisc_priv(root))->duplicate) + goto err; + } + + if (!qdisc_dev(root)) + return 0; + + hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { + if (sch != q && q->ops->cl_ops == &netem_class_ops) { + if (duplicates || + ((struct netem_sched_data *)qdisc_priv(q))->duplicate) + goto err; + } + } + + return 0; + +err: + NL_SET_ERR_MSG(extack, + "netem: cannot mix duplicating netems with other netems in tree"); + return -EINVAL; +} + /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) @@ -1031,6 +1066,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; + + ret = check_netem_in_tree(sch, qopt->duplicate, extack); + if (ret) + goto unlock; + q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index cc30f7a32f1a..9e2b9a490db2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -211,7 +211,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) - qdisc_tree_flush_backlog(q->queues[i]); + qdisc_purge_queue(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index bf1282cb22eb..2255355e51d3 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -412,7 +412,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; struct qfq_aggregate *new_agg = NULL; - u32 weight, lmax, inv_w; + u32 weight, lmax, inv_w, old_weight, old_lmax; int err; int delta_w; @@ -443,12 +443,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; - if (cl != NULL && - lmax == cl->agg->lmax && - weight == cl->agg->class_weight) - return 0; /* nothing to change */ + if (cl != NULL) { + sch_tree_lock(sch); + old_weight = cl->agg->class_weight; + old_lmax = cl->agg->lmax; + sch_tree_unlock(sch); + if (lmax == old_lmax && weight == old_weight) + return 0; /* nothing to change */ + } - delta_w = weight - (cl ? cl->agg->class_weight : 0); + delta_w = weight - (cl ? old_weight : 0); if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, @@ -532,9 +536,6 @@ destroy_class: static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { - struct qfq_sched *q = qdisc_priv(sch); - - qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); @@ -555,6 +556,7 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); + qfq_rm_from_agg(q, cl); sch_tree_unlock(sch); @@ -625,6 +627,7 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, { struct qfq_class *cl = (struct qfq_class *)arg; struct nlattr *nest; + u32 class_weight, lmax; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; @@ -633,8 +636,13 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || - nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) + + sch_tree_lock(sch); + class_weight = cl->agg->class_weight; + lmax = cl->agg->lmax; + sch_tree_unlock(sch); + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, lmax)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -651,8 +659,10 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, memset(&xstats, 0, sizeof(xstats)); + sch_tree_lock(sch); xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; + sch_tree_unlock(sch); if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || @@ -989,7 +999,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ list_del_init(&cl->alist); - else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + else if (cl->deficit < qdisc_peek_len(cl->qdisc)) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } @@ -1491,6 +1501,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { + qfq_rm_from_agg(q, cl); qfq_destroy_class(sch, cl); } } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 339d70b4a4c5..479c42d11083 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -285,7 +285,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, q->userbits = userbits; q->limit = ctl->limit; if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old_child = q->qdisc; q->qdisc = child; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index a8081492d671..96eb2f122973 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -310,7 +310,10 @@ drop: /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ x = q->tail->next; slot = &q->slots[x]; - q->tail->next = slot->next; + if (slot->next == x) + q->tail = NULL; /* no more active slots */ + else + q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } @@ -653,6 +656,14 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); return -EINVAL; } + + if (ctl->perturb_period < 0 || + ctl->perturb_period > INT_MAX / HZ) { + NL_SET_ERR_MSG_MOD(extack, "invalid perturb period"); + return -EINVAL; + } + perturb_period = ctl->perturb_period * HZ; + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; @@ -669,14 +680,12 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, headdrop = q->headdrop; maxdepth = q->maxdepth; maxflows = q->maxflows; - perturb_period = q->perturb_period; quantum = q->quantum; flags = q->flags; /* update and validate configuration */ if (ctl->quantum) quantum = ctl->quantum; - perturb_period = ctl->perturb_period * HZ; if (ctl->flows) maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 14021b812329..e759e43ad27e 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -998,7 +998,7 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, - TC_QOPT_MAX_QUEUE), + TC_QOPT_MAX_QUEUE - 1), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, @@ -1328,13 +1328,15 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, stab = rtnl_dereference(q->root->stab); - oper = rtnl_dereference(q->oper_sched); + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); - admin = rtnl_dereference(q->admin_sched); + admin = rcu_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); + rcu_read_unlock(); break; } @@ -1696,19 +1698,15 @@ static int taprio_parse_tc_entry(struct Qdisc *sch, if (err < 0) return err; - if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { + if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_TAPRIO_TC_ENTRY_INDEX)) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); - if (tc >= TC_QOPT_MAX_QUEUE) { - NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); - return -ERANGE; - } - if (*seen_tcs & BIT(tc)) { - NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_TC_ENTRY_INDEX], + "Duplicate tc entry"); return -EINVAL; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index dc26b22d53c7..4c977f049670 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -452,7 +452,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; } |