summaryrefslogtreecommitdiff
path: root/net/sched
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig12
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c9
-rw-r--r--net/sched/act_connmark.c18
-rw-r--r--net/sched/act_csum.c18
-rw-r--r--net/sched/act_ct.c30
-rw-r--r--net/sched/act_ctinfo.c42
-rw-r--r--net/sched/act_mpls.c21
-rw-r--r--net/sched/act_nat.c25
-rw-r--r--net/sched/act_pedit.c20
-rw-r--r--net/sched/act_police.c18
-rw-r--r--net/sched/act_skbedit.c20
-rw-r--r--net/sched/bpf_qdisc.c9
-rw-r--r--net/sched/em_text.c2
-rw-r--r--net/sched/sch_api.c52
-rw-r--r--net/sched/sch_cake.c5
-rw-r--r--net/sched/sch_dualpi2.c1175
-rw-r--r--net/sched/sch_generic.c2
-rw-r--r--net/sched/sch_hfsc.c16
-rw-r--r--net/sched/sch_htb.c4
-rw-r--r--net/sched/sch_netem.c40
-rw-r--r--net/sched/sch_qfq.c35
-rw-r--r--net/sched/sch_taprio.c18
23 files changed, 1407 insertions, 185 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ad914d2b2e22..6ddff028b81a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -415,6 +415,18 @@ config NET_SCH_BPF
If unsure, say N.
+config NET_SCH_DUALPI2
+ tristate "Dual Queue PI Square (DUALPI2) scheduler"
+ help
+ Say Y here if you want to use the Dual Queue Proportional Integral
+ Controller Improved with a Square scheduling algorithm.
+ For more information, please see https://tools.ietf.org/html/rfc9332
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_dualpi2.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 904d784902d1..5078ea84e6ad 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o
+obj-$(CONFIG_NET_SCH_DUALPI2) += sch_dualpi2.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 057e20cef375..9e468e463467 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -933,18 +933,25 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
struct tcf_idrinfo *idrinfo)
{
struct idr *idr = &idrinfo->action_idr;
+ bool mutex_taken = false;
struct tc_action *p;
- int ret;
unsigned long id = 1;
unsigned long tmp;
+ int ret;
idr_for_each_entry_ul(idr, p, tmp, id) {
+ if (tc_act_in_hw(p) && !mutex_taken) {
+ rtnl_lock();
+ mutex_taken = true;
+ }
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
else if (ret < 0)
return;
}
+ if (mutex_taken)
+ rtnl_unlock();
idr_destroy(&idrinfo->action_idr);
}
EXPORT_SYMBOL(tcf_idrinfo_destroy);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 0fce631e7c91..3e89927d7116 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -88,7 +88,7 @@ count:
/* using overlimits stats to count how many packets marked */
tcf_action_inc_overlimit_qstats(&ca->common);
out:
- return READ_ONCE(ca->tcf_action);
+ return parms->action;
}
static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
@@ -167,6 +167,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
if (err < 0)
goto release_idr;
+ nparms->action = parm->action;
+
spin_lock_bh(&ci->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock));
@@ -190,20 +192,20 @@ out_free:
static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
+ const struct tcf_connmark_info *ci = to_connmark(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_connmark_info *ci = to_connmark(a);
+ const struct tcf_connmark_parms *parms;
struct tc_connmark opt = {
.index = ci->tcf_index,
.refcnt = refcount_read(&ci->tcf_refcnt) - ref,
.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
};
- struct tcf_connmark_parms *parms;
struct tcf_t t;
- spin_lock_bh(&ci->tcf_lock);
- parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock));
+ rcu_read_lock();
+ parms = rcu_dereference(ci->parms);
- opt.action = ci->tcf_action;
+ opt.action = parms->action;
opt.zone = parms->zone;
if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -212,12 +214,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
TCA_CONNMARK_PAD))
goto nla_put_failure;
- spin_unlock_bh(&ci->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&ci->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 5cc8e407e791..0939e6b2ba4d 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -99,6 +99,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
goto put_chain;
}
params_new->update_flags = parm->update_flags;
+ params_new->action = parm->action;
spin_lock_bh(&p->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
@@ -580,7 +581,7 @@ TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb,
tcf_lastuse_update(&p->tcf_tm);
tcf_action_update_bstats(&p->common, skb);
- action = READ_ONCE(p->tcf_action);
+ action = params->action;
if (unlikely(action == TC_ACT_SHOT))
goto drop;
@@ -631,9 +632,9 @@ drop:
static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
int ref)
{
+ const struct tcf_csum *p = to_tcf_csum(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_csum *p = to_tcf_csum(a);
- struct tcf_csum_params *params;
+ const struct tcf_csum_params *params;
struct tc_csum opt = {
.index = p->tcf_index,
.refcnt = refcount_read(&p->tcf_refcnt) - ref,
@@ -641,10 +642,9 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
};
struct tcf_t t;
- spin_lock_bh(&p->tcf_lock);
- params = rcu_dereference_protected(p->params,
- lockdep_is_held(&p->tcf_lock));
- opt.action = p->tcf_action;
+ rcu_read_lock();
+ params = rcu_dereference(p->params);
+ opt.action = params->action;
opt.update_flags = params->update_flags;
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
@@ -653,12 +653,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
goto nla_put_failure;
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index c02f39efc6ef..6749a4a9a9cd 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -977,7 +977,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
p = rcu_dereference_bh(c->params);
- retval = READ_ONCE(c->tcf_action);
+ retval = p->action;
commit = p->ct_action & TCA_CT_ACT_COMMIT;
clear = p->ct_action & TCA_CT_ACT_CLEAR;
tmpl = p->tmpl;
@@ -1409,6 +1409,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
if (err)
goto cleanup;
+ params->action = parm->action;
spin_lock_bh(&c->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
params = rcu_replace_pointer(c->params, params,
@@ -1442,8 +1443,8 @@ static void tcf_ct_cleanup(struct tc_action *a)
}
static int tcf_ct_dump_key_val(struct sk_buff *skb,
- void *val, int val_type,
- void *mask, int mask_type,
+ const void *val, int val_type,
+ const void *mask, int mask_type,
int len)
{
int err;
@@ -1464,9 +1465,9 @@ static int tcf_ct_dump_key_val(struct sk_buff *skb,
return 0;
}
-static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
+static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p)
{
- struct nf_nat_range2 *range = &p->range;
+ const struct nf_nat_range2 *range = &p->range;
if (!(p->ct_action & TCA_CT_ACT_NAT))
return 0;
@@ -1504,7 +1505,8 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
return 0;
}
-static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper)
+static int tcf_ct_dump_helper(struct sk_buff *skb,
+ const struct nf_conntrack_helper *helper)
{
if (!helper)
return 0;
@@ -1521,9 +1523,8 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_ct *c = to_ct(a);
- struct tcf_ct_params *p;
-
+ const struct tcf_ct *c = to_ct(a);
+ const struct tcf_ct_params *p;
struct tc_ct opt = {
.index = c->tcf_index,
.refcnt = refcount_read(&c->tcf_refcnt) - ref,
@@ -1531,10 +1532,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&c->tcf_lock);
- p = rcu_dereference_protected(c->params,
- lockdep_is_held(&c->tcf_lock));
- opt.action = c->tcf_action;
+ rcu_read_lock();
+ p = rcu_dereference(c->params);
+ opt.action = p->action;
if (tcf_ct_dump_key_val(skb,
&p->ct_action, TCA_CT_ACTION,
@@ -1579,11 +1579,11 @@ skip_dump:
tcf_tm_dump(&t, &c->tcf_tm);
if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
goto nla_put_failure;
- spin_unlock_bh(&c->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&c->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 5b1241ddc758..71efe04d00b5 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -44,9 +44,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
ipv4_change_dsfield(ip_hdr(skb),
INET_ECN_MASK,
newdscp);
- ca->stats_dscp_set++;
+ atomic64_inc(&ca->stats_dscp_set);
} else {
- ca->stats_dscp_error++;
+ atomic64_inc(&ca->stats_dscp_error);
}
}
break;
@@ -57,9 +57,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
ipv6_change_dsfield(ipv6_hdr(skb),
INET_ECN_MASK,
newdscp);
- ca->stats_dscp_set++;
+ atomic64_inc(&ca->stats_dscp_set);
} else {
- ca->stats_dscp_error++;
+ atomic64_inc(&ca->stats_dscp_error);
}
}
break;
@@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
struct tcf_ctinfo_params *cp,
struct sk_buff *skb)
{
- ca->stats_cpmark_set++;
+ atomic64_inc(&ca->stats_cpmark_set);
skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask;
}
@@ -88,13 +88,11 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb,
struct tcf_ctinfo_params *cp;
struct nf_conn *ct;
int proto, wlen;
- int action;
cp = rcu_dereference_bh(ca->params);
tcf_lastuse_update(&ca->tcf_tm);
tcf_action_update_bstats(&ca->common, skb);
- action = READ_ONCE(ca->tcf_action);
wlen = skb_network_offset(skb);
switch (skb_protocol(skb, true)) {
@@ -141,7 +139,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb,
if (thash)
nf_ct_put(ct);
out:
- return action;
+ return cp->action;
}
static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
@@ -258,6 +256,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
cp_new->mode |= CTINFO_MODE_CPMARK;
}
+ cp_new->action = actparm->action;
+
spin_lock_bh(&ci->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
cp_new = rcu_replace_pointer(ci->params, cp_new,
@@ -282,25 +282,24 @@ release_idr:
static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
- struct tcf_ctinfo *ci = to_ctinfo(a);
+ const struct tcf_ctinfo *ci = to_ctinfo(a);
+ unsigned char *b = skb_tail_pointer(skb);
+ const struct tcf_ctinfo_params *cp;
struct tc_ctinfo opt = {
.index = ci->tcf_index,
.refcnt = refcount_read(&ci->tcf_refcnt) - ref,
.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
};
- unsigned char *b = skb_tail_pointer(skb);
- struct tcf_ctinfo_params *cp;
struct tcf_t t;
- spin_lock_bh(&ci->tcf_lock);
- cp = rcu_dereference_protected(ci->params,
- lockdep_is_held(&ci->tcf_lock));
+ rcu_read_lock();
+ cp = rcu_dereference(ci->params);
tcf_tm_dump(&t, &ci->tcf_tm);
if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
goto nla_put_failure;
- opt.action = ci->tcf_action;
+ opt.action = cp->action;
if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
goto nla_put_failure;
@@ -323,22 +322,25 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
}
if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
- ci->stats_dscp_set, TCA_CTINFO_PAD))
+ atomic64_read(&ci->stats_dscp_set),
+ TCA_CTINFO_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
- ci->stats_dscp_error, TCA_CTINFO_PAD))
+ atomic64_read(&ci->stats_dscp_error),
+ TCA_CTINFO_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
- ci->stats_cpmark_set, TCA_CTINFO_PAD))
+ atomic64_read(&ci->stats_cpmark_set),
+ TCA_CTINFO_PAD))
goto nla_put_failure;
- spin_unlock_bh(&ci->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&ci->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 9f86f4e666d3..6654011dcd2b 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -57,7 +57,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
struct tcf_mpls *m = to_mpls(a);
struct tcf_mpls_params *p;
__be32 new_lse;
- int ret, mac_len;
+ int mac_len;
tcf_lastuse_update(&m->tcf_tm);
bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -72,8 +72,6 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
mac_len = skb_network_offset(skb);
}
- ret = READ_ONCE(m->tcf_action);
-
p = rcu_dereference_bh(m->mpls_p);
switch (p->tcfm_action) {
@@ -122,7 +120,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
if (skb_at_tc_ingress(skb))
skb_pull_rcsum(skb, skb->mac_len);
- return ret;
+ return p->action;
drop:
qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
@@ -296,6 +294,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
ACT_MPLS_BOS_NOT_SET);
p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO],
htons(ETH_P_MPLS_UC));
+ p->action = parm->action;
spin_lock_bh(&m->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
@@ -330,8 +329,8 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_mpls *m = to_mpls(a);
- struct tcf_mpls_params *p;
+ const struct tcf_mpls *m = to_mpls(a);
+ const struct tcf_mpls_params *p;
struct tc_mpls opt = {
.index = m->tcf_index,
.refcnt = refcount_read(&m->tcf_refcnt) - ref,
@@ -339,10 +338,10 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&m->tcf_lock);
- opt.action = m->tcf_action;
- p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(m->mpls_p);
opt.m_action = p->tcfm_action;
+ opt.action = p->action;
if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -370,12 +369,12 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
goto nla_put_failure;
- spin_unlock_bh(&m->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&m->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -EMSGSIZE;
}
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index d541f553805f..26241d80ebe0 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -91,6 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
nparm->new_addr = parm->new_addr;
nparm->mask = parm->mask;
nparm->flags = parm->flags;
+ nparm->action = parm->action;
p = to_tcf_nat(*a);
@@ -130,17 +131,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb,
tcf_lastuse_update(&p->tcf_tm);
tcf_action_update_bstats(&p->common, skb);
- action = READ_ONCE(p->tcf_action);
-
parms = rcu_dereference_bh(p->parms);
+ action = parms->action;
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
+
old_addr = parms->old_addr;
new_addr = parms->new_addr;
mask = parms->mask;
egress = parms->flags & TCA_NAT_FLAG_EGRESS;
- if (unlikely(action == TC_ACT_SHOT))
- goto drop;
-
noff = skb_network_offset(skb);
if (!pskb_may_pull(skb, sizeof(*iph) + noff))
goto drop;
@@ -268,21 +268,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_nat *p = to_tcf_nat(a);
+ const struct tcf_nat *p = to_tcf_nat(a);
+ const struct tcf_nat_parms *parms;
struct tc_nat opt = {
.index = p->tcf_index,
.refcnt = refcount_read(&p->tcf_refcnt) - ref,
.bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
- struct tcf_nat_parms *parms;
struct tcf_t t;
- spin_lock_bh(&p->tcf_lock);
-
- opt.action = p->tcf_action;
+ rcu_read_lock();
- parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock));
+ parms = rcu_dereference(p->parms);
+ opt.action = parms->action;
opt.old_addr = parms->old_addr;
opt.new_addr = parms->new_addr;
opt.mask = parms->mask;
@@ -294,12 +293,12 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
goto nla_put_failure;
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index fc0a35a7b62a..4b65901397a8 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -279,7 +279,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
}
p = to_pedit(*a);
-
+ nparms->action = parm->action;
spin_lock_bh(&p->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
oparms = rcu_replace_pointer(p->parms, nparms, 1);
@@ -483,7 +483,7 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
bad:
tcf_action_inc_overlimit_qstats(&p->common);
done:
- return p->tcf_action;
+ return parms->action;
}
static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets,
@@ -500,19 +500,19 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_pedit *p = to_pedit(a);
- struct tcf_pedit_parms *parms;
+ const struct tcf_pedit *p = to_pedit(a);
+ const struct tcf_pedit_parms *parms;
struct tc_pedit *opt;
struct tcf_t t;
int s;
- spin_lock_bh(&p->tcf_lock);
- parms = rcu_dereference_protected(p->parms, 1);
+ rcu_read_lock();
+ parms = rcu_dereference(p->parms);
s = struct_size(opt, keys, parms->tcfp_nkeys);
opt = kzalloc(s, GFP_ATOMIC);
if (unlikely(!opt)) {
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
return -ENOBUFS;
}
opt->nkeys = parms->tcfp_nkeys;
@@ -521,7 +521,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
flex_array_size(opt, keys, parms->tcfp_nkeys));
opt->index = p->tcf_index;
opt->flags = parms->tcfp_flags;
- opt->action = p->tcf_action;
+ opt->action = parms->action;
opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
@@ -540,13 +540,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
goto nla_put_failure;
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
kfree(opt);
return skb->len;
nla_put_failure:
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
kfree(opt);
return -1;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index a214ed681142..0e1c61183379 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -198,6 +198,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
psched_ppscfg_precompute(&new->ppsrate, pps);
}
+ new->action = parm->action;
spin_lock_bh(&police->tcf_lock);
spin_lock_bh(&police->tcfp_lock);
police->tcfp_t_c = ktime_get_ns();
@@ -254,8 +255,8 @@ TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb,
tcf_lastuse_update(&police->tcf_tm);
bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb);
- ret = READ_ONCE(police->tcf_action);
p = rcu_dereference_bh(police->params);
+ ret = p->action;
if (p->tcfp_ewma_rate) {
struct gnet_stats_rate_est64 sample;
@@ -338,9 +339,9 @@ static void tcf_police_stats_update(struct tc_action *a,
static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
+ const struct tcf_police *police = to_police(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_police *police = to_police(a);
- struct tcf_police_params *p;
+ const struct tcf_police_params *p;
struct tc_police opt = {
.index = police->tcf_index,
.refcnt = refcount_read(&police->tcf_refcnt) - ref,
@@ -348,10 +349,9 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&police->tcf_lock);
- opt.action = police->tcf_action;
- p = rcu_dereference_protected(police->params,
- lockdep_is_held(&police->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(police->params);
+ opt.action = p->action;
opt.mtu = p->tcfp_mtu;
opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
if (p->rate_present) {
@@ -392,12 +392,12 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &police->tcf_tm);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
- spin_unlock_bh(&police->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&police->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 1f1d9ce3e968..8c1d1554f657 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -43,13 +43,11 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb,
{
struct tcf_skbedit *d = to_skbedit(a);
struct tcf_skbedit_params *params;
- int action;
tcf_lastuse_update(&d->tcf_tm);
bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb);
params = rcu_dereference_bh(d->params);
- action = READ_ONCE(d->tcf_action);
if (params->flags & SKBEDIT_F_PRIORITY)
skb->priority = params->priority;
@@ -85,7 +83,7 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb,
}
if (params->flags & SKBEDIT_F_PTYPE)
skb->pkt_type = params->ptype;
- return action;
+ return params->action;
err:
qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
@@ -262,6 +260,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
if (flags & SKBEDIT_F_MASK)
params_new->mask = *mask;
+ params_new->action = parm->action;
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
params_new = rcu_replace_pointer(d->params, params_new,
@@ -284,9 +283,9 @@ release_idr:
static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
+ const struct tcf_skbedit *d = to_skbedit(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_skbedit *d = to_skbedit(a);
- struct tcf_skbedit_params *params;
+ const struct tcf_skbedit_params *params;
struct tc_skbedit opt = {
.index = d->tcf_index,
.refcnt = refcount_read(&d->tcf_refcnt) - ref,
@@ -295,10 +294,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
u64 pure_flags = 0;
struct tcf_t t;
- spin_lock_bh(&d->tcf_lock);
- params = rcu_dereference_protected(d->params,
- lockdep_is_held(&d->tcf_lock));
- opt.action = d->tcf_action;
+ rcu_read_lock();
+ params = rcu_dereference(d->params);
+ opt.action = params->action;
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -333,12 +331,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
goto nla_put_failure;
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index 7ea8b54b2ab1..adcb618a2bfc 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -130,8 +130,7 @@ static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log,
return 0;
}
-BTF_ID_LIST(bpf_qdisc_init_prologue_ids)
-BTF_ID(func, bpf_qdisc_init_prologue)
+BTF_ID_LIST_SINGLE(bpf_qdisc_init_prologue_ids, func, bpf_qdisc_init_prologue)
static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog)
@@ -161,8 +160,7 @@ static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
return insn - insn_buf;
}
-BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids)
-BTF_ID(func, bpf_qdisc_reset_destroy_epilogue)
+BTF_ID_LIST_SINGLE(bpf_qdisc_reset_destroy_epilogue_ids, func, bpf_qdisc_reset_destroy_epilogue)
static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog,
s16 ctx_stack_off)
@@ -451,8 +449,7 @@ static struct bpf_struct_ops bpf_Qdisc_ops = {
.owner = THIS_MODULE,
};
-BTF_ID_LIST(bpf_sk_buff_dtor_ids)
-BTF_ID(func, bpf_kfree_skb)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb)
static int __init bpf_qdisc_kfunc_init(void)
{
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index 420c66203b17..6b3d0af72c39 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -108,7 +108,7 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
struct text_match *tm = EM_TEXT_PRIV(m);
struct tcf_em_text conf;
- strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+ strscpy(conf.algo, tm->config->ops->name);
conf.from_offset = tm->from_offset;
conf.to_offset = tm->to_offset;
conf.from_layer = tm->from_layer;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c5e3673aadbe..d7c767b861a4 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -336,17 +336,22 @@ out:
return q;
}
-static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
+static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid,
+ struct netlink_ext_ack *extack)
{
unsigned long cl;
const struct Qdisc_class_ops *cops = p->ops->cl_ops;
- if (cops == NULL)
- return NULL;
+ if (cops == NULL) {
+ NL_SET_ERR_MSG(extack, "Parent qdisc is not classful");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
cl = cops->find(p, classid);
- if (cl == 0)
- return NULL;
+ if (cl == 0) {
+ NL_SET_ERR_MSG(extack, "Specified class not found");
+ return ERR_PTR(-ENOENT);
+ }
return cops->leaf(p, cl);
}
@@ -596,16 +601,6 @@ out:
qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
-void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
-{
- if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
- pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
- txt, qdisc->ops->id, qdisc->handle >> 16);
- qdisc->flags |= TCQ_F_WARN_NONWC;
- }
-}
-EXPORT_SYMBOL(qdisc_warn_nonwc);
-
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
@@ -780,15 +775,12 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
- bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
const struct Qdisc_class_ops *cops;
unsigned long cl;
u32 parentid;
bool notify;
int drops;
- if (n == 0 && len == 0)
- return;
drops = max_t(int, n, 0);
rcu_read_lock();
while ((parentid = sch->parent)) {
@@ -797,17 +789,8 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
if (sch->flags & TCQ_F_NOPARENT)
break;
- /* Notify parent qdisc only if child qdisc becomes empty.
- *
- * If child was empty even before update then backlog
- * counter is screwed and we skip notification because
- * parent class is already passive.
- *
- * If the original child was offloaded then it is allowed
- * to be seem as empty, so the parent is notified anyway.
- */
- notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
- !qdisc_is_offloaded);
+ /* Notify parent qdisc only if child qdisc becomes empty. */
+ notify = !sch->q.qlen;
/* TODO: perform the search on a per txq basis */
sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
if (sch == NULL) {
@@ -816,6 +799,9 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
}
cops = sch->ops->cl_ops;
if (notify && cops->qlen_notify) {
+ /* Note that qlen_notify must be idempotent as it may get called
+ * multiple times.
+ */
cl = cops->find(sch, parentid);
cops->qlen_notify(sch, cl);
}
@@ -1499,7 +1485,7 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
return -ENOENT;
}
- q = qdisc_leaf(p, clid);
+ q = qdisc_leaf(p, clid, extack);
} else if (dev_ingress_queue(dev)) {
q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
}
@@ -1510,6 +1496,8 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
return -ENOENT;
}
+ if (IS_ERR(q))
+ return PTR_ERR(q);
if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
NL_SET_ERR_MSG(extack, "Invalid handle");
@@ -1611,7 +1599,9 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
return -ENOENT;
}
- q = qdisc_leaf(p, clid);
+ q = qdisc_leaf(p, clid, extack);
+ if (IS_ERR(q))
+ return PTR_ERR(q);
} else if (dev_ingress_queue_create(dev)) {
q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
}
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 48dd8c88903f..dbcfb948c867 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1407,7 +1407,10 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
return cake_calc_overhead(q, len, off);
/* borrowed from qdisc_pkt_len_init() */
- hdr_len = skb_transport_offset(skb);
+ if (!skb->encapsulation)
+ hdr_len = skb_transport_offset(skb);
+ else
+ hdr_len = skb_inner_transport_offset(skb);
/* + transport layer */
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
new file mode 100644
index 000000000000..845375ebd4ea
--- /dev/null
+++ b/net/sched/sch_dualpi2.c
@@ -0,0 +1,1175 @@
+// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+/* Copyright (C) 2024 Nokia
+ *
+ * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
+ * Author: Olga Albisser <olga@albisser.org>
+ * Author: Henrik Steen <henrist@henrist.net>
+ * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
+ * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
+ *
+ * DualPI Improved with a Square (dualpi2):
+ * - Supports congestion controls that comply with the Prague requirements
+ * in RFC9331 (e.g. TCP-Prague)
+ * - Supports coupled dual-queue with PI2 as defined in RFC9332
+ * - Supports ECN L4S-identifier (IP.ECN==0b*1)
+ *
+ * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
+ * they do not meet the 'Prague L4S Requirements' listed in RFC 9331
+ * Section 4, so they can only be used with DualPI2 in a datacenter
+ * context.
+ *
+ * References:
+ * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
+ * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and
+ * scalable TCP." in proc. ACM CoNEXT'16, 2016.
+ */
+
+#include <linux/errno.h>
+#include <linux/hrtimer.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/gso.h>
+#include <net/inet_ecn.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+
+/* 32b enable to support flows with windows up to ~8.6 * 1e9 packets
+ * i.e., twice the maximal snd_cwnd.
+ * MAX_PROB must be consistent with the RNG in dualpi2_roll().
+ */
+#define MAX_PROB U32_MAX
+
+/* alpha/beta values exchanged over netlink are in units of 256ns */
+#define ALPHA_BETA_SHIFT 8
+
+/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
+ * computations. Consequently (see and dualpi2_scale_alpha_beta()), their
+ * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
+ * (~4MHz) as those are given in 1/256th. This enable to tune alpha/beta to
+ * control flows whose maximal RTTs can be in usec up to few secs.
+ */
+#define ALPHA_BETA_MAX ((1U << 31) - 1)
+
+/* Internal alpha/beta are in units of 64ns.
+ * This enables to use all alpha/beta values in the allowed range without loss
+ * of precision due to rounding when scaling them internally, e.g.,
+ * scale_alpha_beta(1) will not round down to 0.
+ */
+#define ALPHA_BETA_GRANULARITY 6
+
+#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
+
+/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
+#define MAX_WC 100
+
+struct dualpi2_sched_data {
+ struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */
+ struct Qdisc *sch; /* The Classic queue (C-queue) */
+
+ /* Registered tc filters */
+ struct tcf_proto __rcu *tcf_filters;
+ struct tcf_block *tcf_block;
+
+ /* PI2 parameters */
+ u64 pi2_target; /* Target delay in nanoseconds */
+ u32 pi2_tupdate; /* Timer frequency in nanoseconds */
+ u32 pi2_prob; /* Base PI probability */
+ u32 pi2_alpha; /* Gain factor for the integral rate response */
+ u32 pi2_beta; /* Gain factor for the proportional response */
+ struct hrtimer pi2_timer; /* prob update timer */
+
+ /* Step AQM (L-queue only) parameters */
+ u32 step_thresh; /* Step threshold */
+ bool step_in_packets; /* Step thresh in packets (1) or time (0) */
+
+ /* C-queue starvation protection */
+ s32 c_protection_credit; /* Credit (sign indicates which queue) */
+ s32 c_protection_init; /* Reset value of the credit */
+ u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
+ u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */
+
+ /* General dualQ parameters */
+ u32 memory_limit; /* Memory limit of both queues */
+ u8 coupling_factor;/* Coupling factor (k) between both queues */
+ u8 ecn_mask; /* Mask to match packets into L-queue */
+ u32 min_qlen_step; /* Minimum queue length to apply step thresh */
+ bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */
+ bool drop_overload; /* Drop (1) on overload, or overflow (0) */
+ bool split_gso; /* Split aggregated skb (1) or leave as is (0) */
+
+ /* Statistics */
+ u64 c_head_ts; /* Enqueue timestamp of the C-queue head */
+ u64 l_head_ts; /* Enqueue timestamp of the L-queue head */
+ u64 last_qdelay; /* Q delay val at the last probability update */
+ u32 packets_in_c; /* Enqueue packet counter of the C-queue */
+ u32 packets_in_l; /* Enqueue packet counter of the L-queue */
+ u32 maxq; /* Maximum queue size of the C-queue */
+ u32 ecn_mark; /* ECN mark pkt counter due to PI probability */
+ u32 step_marks; /* ECN mark pkt counter due to step AQM */
+ u32 memory_used; /* Memory used of both queues */
+ u32 max_memory_used;/* Maximum used memory */
+
+ /* Deferred drop statistics */
+ u32 deferred_drops_cnt; /* Packets dropped */
+ u32 deferred_drops_len; /* Bytes dropped */
+};
+
+struct dualpi2_skb_cb {
+ u64 ts; /* Timestamp at enqueue */
+ u8 apply_step:1, /* Can we apply the step threshold */
+ classified:2, /* Packet classification results */
+ ect:2; /* Packet ECT codepoint */
+};
+
+enum dualpi2_classification_results {
+ DUALPI2_C_CLASSIC = 0, /* C-queue */
+ DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */
+ DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */
+ __DUALPI2_C_MAX /* Keep last*/
+};
+
+static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb));
+ return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference)
+{
+ return reference - dualpi2_skb_cb(skb)->ts;
+}
+
+static u64 head_enqueue_time(struct Qdisc *q)
+{
+ struct sk_buff *skb = qdisc_peek_head(q);
+
+ return skb ? dualpi2_skb_cb(skb)->ts : 0;
+}
+
+static u32 dualpi2_scale_alpha_beta(u32 param)
+{
+ u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
+
+ do_div(tmp, NSEC_PER_SEC);
+ return tmp;
+}
+
+static u32 dualpi2_unscale_alpha_beta(u32 param)
+{
+ u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING);
+
+ do_div(tmp, MAX_PROB);
+ return tmp;
+}
+
+static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
+{
+ return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
+}
+
+static bool skb_is_l4s(struct sk_buff *skb)
+{
+ return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S;
+}
+
+static bool skb_in_l_queue(struct sk_buff *skb)
+{
+ return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC;
+}
+
+static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q)
+{
+ return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step;
+}
+
+static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb)
+{
+ if (INET_ECN_set_ce(skb)) {
+ q->ecn_mark++;
+ return true;
+ }
+ return false;
+}
+
+static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
+{
+ q->c_protection_credit = q->c_protection_init;
+}
+
+/* This computes the initial credit value and WRR weight for the L queue (wl)
+ * from the weight of the C queue (wc).
+ * If wl > wc, the scheduler will start with the L queue when reset.
+ */
+static void dualpi2_calculate_c_protection(struct Qdisc *sch,
+ struct dualpi2_sched_data *q, u32 wc)
+{
+ q->c_protection_wc = wc;
+ q->c_protection_wl = MAX_WC - wc;
+ q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
+ ((int)q->c_protection_wc - (int)q->c_protection_wl);
+ dualpi2_reset_c_protection(q);
+}
+
+static bool dualpi2_roll(u32 prob)
+{
+ return get_random_u32() <= prob;
+}
+
+/* Packets in the C-queue are subject to a marking probability pC, which is the
+ * square of the internal PI probability (i.e., have an overall lower mark/drop
+ * probability). If the qdisc is overloaded, ignore ECT values and only drop.
+ *
+ * Note that this marking scheme is also applied to L4S packets during overload.
+ * Return true if packet dropping is required in C queue
+ */
+static bool dualpi2_classic_marking(struct dualpi2_sched_data *q,
+ struct sk_buff *skb, u32 prob,
+ bool overload)
+{
+ if (dualpi2_roll(prob) && dualpi2_roll(prob)) {
+ if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
+ return true;
+ dualpi2_mark(q, skb);
+ }
+ return false;
+}
+
+/* Packets in the L-queue are subject to a marking probability pL given by the
+ * internal PI probability scaled by the coupling factor.
+ *
+ * On overload (i.e., @local_l_prob is >= 100%):
+ * - if the qdisc is configured to trade losses to preserve latency (i.e.,
+ * @q->drop_overload), apply classic drops first before marking.
+ * - otherwise, preserve the "no loss" property of ECN at the cost of queueing
+ * delay, eventually resulting in taildrop behavior once sch->limit is
+ * reached.
+ * Return true if packet dropping is required in L queue
+ */
+static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q,
+ struct sk_buff *skb,
+ u64 local_l_prob, u32 prob,
+ bool overload)
+{
+ if (overload) {
+ /* Apply classic drop */
+ if (!q->drop_overload ||
+ !(dualpi2_roll(prob) && dualpi2_roll(prob)))
+ goto mark;
+ return true;
+ }
+
+ /* We can safely cut the upper 32b as overload==false */
+ if (dualpi2_roll(local_l_prob)) {
+ /* Non-ECT packets could have classified as L4S by filters. */
+ if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
+ return true;
+mark:
+ dualpi2_mark(q, skb);
+ }
+ return false;
+}
+
+/* Decide whether a given packet must be dropped (or marked if ECT), according
+ * to the PI2 probability.
+ *
+ * Never mark/drop if we have a standing queue of less than 2 MTUs.
+ */
+static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q,
+ struct sk_buff *skb)
+{
+ u64 local_l_prob;
+ bool overload;
+ u32 prob;
+
+ if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch)))
+ return false;
+
+ prob = READ_ONCE(q->pi2_prob);
+ local_l_prob = (u64)prob * q->coupling_factor;
+ overload = local_l_prob > MAX_PROB;
+
+ switch (dualpi2_skb_cb(skb)->classified) {
+ case DUALPI2_C_CLASSIC:
+ return dualpi2_classic_marking(q, skb, prob, overload);
+ case DUALPI2_C_L4S:
+ return dualpi2_scalable_marking(q, skb, local_l_prob, prob,
+ overload);
+ default: /* DUALPI2_C_LLLL */
+ return false;
+ }
+}
+
+static void dualpi2_read_ect(struct sk_buff *skb)
+{
+ struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
+ int wlen = skb_network_offset(skb);
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ goto not_ecn;
+
+ cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK;
+ break;
+ case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ goto not_ecn;
+
+ cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK;
+ break;
+ default:
+ goto not_ecn;
+ }
+ return;
+
+not_ecn:
+ /* Non pullable/writable packets can only be dropped hence are
+ * classified as not ECT.
+ */
+ cb->ect = INET_ECN_NOT_ECT;
+}
+
+static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ dualpi2_read_ect(skb);
+ if (cb->ect & q->ecn_mask) {
+ cb->classified = DUALPI2_C_L4S;
+ return NET_XMIT_SUCCESS;
+ }
+
+ if (TC_H_MAJ(skb->priority) == q->sch->handle &&
+ TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) {
+ cb->classified = TC_H_MIN(skb->priority);
+ return NET_XMIT_SUCCESS;
+ }
+
+ fl = rcu_dereference_bh(q->tcf_filters);
+ if (!fl) {
+ cb->classified = DUALPI2_C_CLASSIC;
+ return NET_XMIT_SUCCESS;
+ }
+
+ result = tcf_classify(skb, NULL, fl, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ }
+#endif
+ cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ?
+ TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC;
+ }
+ return NET_XMIT_SUCCESS;
+}
+
+static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct dualpi2_skb_cb *cb;
+
+ if (unlikely(qdisc_qlen(sch) >= sch->limit) ||
+ unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) {
+ qdisc_qstats_overlimit(sch);
+ if (skb_in_l_queue(skb))
+ qdisc_qstats_overlimit(q->l_queue);
+ return qdisc_drop_reason(skb, sch, to_free,
+ SKB_DROP_REASON_QDISC_OVERLIMIT);
+ }
+
+ if (q->drop_early && must_drop(sch, q, skb)) {
+ qdisc_drop_reason(skb, sch, to_free,
+ SKB_DROP_REASON_QDISC_CONGESTED);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ }
+
+ cb = dualpi2_skb_cb(skb);
+ cb->ts = ktime_get_ns();
+ q->memory_used += skb->truesize;
+ if (q->memory_used > q->max_memory_used)
+ q->max_memory_used = q->memory_used;
+
+ if (qdisc_qlen(sch) > q->maxq)
+ q->maxq = qdisc_qlen(sch);
+
+ if (skb_in_l_queue(skb)) {
+ /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */
+ dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q);
+
+ /* Keep the overall qdisc stats consistent */
+ ++sch->q.qlen;
+ qdisc_qstats_backlog_inc(sch, skb);
+ ++q->packets_in_l;
+ if (!q->l_head_ts)
+ q->l_head_ts = cb->ts;
+ return qdisc_enqueue_tail(skb, q->l_queue);
+ }
+ ++q->packets_in_c;
+ if (!q->c_head_ts)
+ q->c_head_ts = cb->ts;
+ return qdisc_enqueue_tail(skb, sch);
+}
+
+/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue
+ * each of those individually. This yields the following benefits, at the
+ * expense of CPU usage:
+ * - Finer-grained AQM actions as the sub-packets of a burst no longer share the
+ * same fate (e.g., the random mark/drop probability is applied individually)
+ * - Improved precision of the starvation protection/WRR scheduler at dequeue,
+ * as the size of the dequeued packets will be smaller.
+ */
+static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ err = dualpi2_skb_classify(q, skb);
+ if (err != NET_XMIT_SUCCESS) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ if (q->split_gso && skb_is_gso(skb)) {
+ netdev_features_t features;
+ struct sk_buff *nskb, *next;
+ int cnt, byte_len, orig_len;
+ int err;
+
+ features = netif_skb_features(skb);
+ nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(nskb))
+ return qdisc_drop(skb, sch, to_free);
+
+ cnt = 1;
+ byte_len = 0;
+ orig_len = qdisc_pkt_len(skb);
+ skb_list_walk_safe(nskb, nskb, next) {
+ skb_mark_not_on_list(nskb);
+
+ /* Iterate through GSO fragments of an skb:
+ * (1) Set pkt_len from the single GSO fragments
+ * (2) Copy classified and ect values of an skb
+ * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
+ */
+ qdisc_skb_cb(nskb)->pkt_len = nskb->len;
+ dualpi2_skb_cb(nskb)->classified =
+ dualpi2_skb_cb(skb)->classified;
+ dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
+ err = dualpi2_enqueue_skb(nskb, sch, to_free);
+
+ if (err == NET_XMIT_SUCCESS) {
+ /* Compute the backlog adjustment that needs
+ * to be propagated in the qdisc tree to reflect
+ * all new skbs successfully enqueued.
+ */
+ ++cnt;
+ byte_len += nskb->len;
+ }
+ }
+ if (cnt > 1) {
+ /* The caller will add the original skb stats to its
+ * backlog, compensate this if any nskb is enqueued.
+ */
+ --cnt;
+ byte_len -= orig_len;
+ }
+ qdisc_tree_reduce_backlog(sch, -cnt, -byte_len);
+ consume_skb(skb);
+ return err;
+ }
+ return dualpi2_enqueue_skb(skb, sch, to_free);
+}
+
+/* Select the queue from which the next packet can be dequeued, ensuring that
+ * neither queue can starve the other with a WRR scheduler.
+ *
+ * The sign of the WRR credit determines the next queue, while the size of
+ * the dequeued packet determines the magnitude of the WRR credit change. If
+ * either queue is empty, the WRR credit is kept unchanged.
+ *
+ * As the dequeued packet can be dropped later, the caller has to perform the
+ * qdisc_bstats_update() calls.
+ */
+static struct sk_buff *dequeue_packet(struct Qdisc *sch,
+ struct dualpi2_sched_data *q,
+ int *credit_change,
+ u64 now)
+{
+ struct sk_buff *skb = NULL;
+ int c_len;
+
+ *credit_change = 0;
+ c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue);
+ if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) {
+ skb = __qdisc_dequeue_head(&q->l_queue->q);
+ WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue));
+ if (c_len)
+ *credit_change = q->c_protection_wc;
+ qdisc_qstats_backlog_dec(q->l_queue, skb);
+
+ /* Keep the global queue size consistent */
+ --sch->q.qlen;
+ q->memory_used -= skb->truesize;
+ } else if (c_len) {
+ skb = __qdisc_dequeue_head(&sch->q);
+ WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch));
+ if (qdisc_qlen(q->l_queue))
+ *credit_change = ~((s32)q->c_protection_wl) + 1;
+ q->memory_used -= skb->truesize;
+ } else {
+ dualpi2_reset_c_protection(q);
+ return NULL;
+ }
+ *credit_change *= qdisc_pkt_len(skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ return skb;
+}
+
+static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb,
+ u64 now)
+{
+ u64 qdelay = 0;
+
+ if (q->step_in_packets)
+ qdelay = qdisc_qlen(q->l_queue);
+ else
+ qdelay = dualpi2_sojourn_time(skb, now);
+
+ if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) {
+ if (!dualpi2_skb_cb(skb)->ect) {
+ /* Drop this non-ECT packet */
+ return 1;
+ }
+
+ if (dualpi2_mark(q, skb))
+ ++q->step_marks;
+ }
+ qdisc_bstats_update(q->l_queue, skb);
+ return 0;
+}
+
+static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
+ struct Qdisc *sch, enum skb_drop_reason reason)
+{
+ ++q->deferred_drops_cnt;
+ q->deferred_drops_len += qdisc_pkt_len(skb);
+ kfree_skb_reason(skb, reason);
+ qdisc_qstats_drop(sch);
+}
+
+static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ int credit_change;
+ u64 now;
+
+ now = ktime_get_ns();
+
+ while ((skb = dequeue_packet(sch, q, &credit_change, now))) {
+ if (!q->drop_early && must_drop(sch, q, skb)) {
+ drop_and_retry(q, skb, sch,
+ SKB_DROP_REASON_QDISC_CONGESTED);
+ continue;
+ }
+
+ if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) {
+ qdisc_qstats_drop(q->l_queue);
+ drop_and_retry(q, skb, sch,
+ SKB_DROP_REASON_DUALPI2_STEP_DROP);
+ continue;
+ }
+
+ q->c_protection_credit += credit_change;
+ qdisc_bstats_update(sch, skb);
+ break;
+ }
+
+ if (q->deferred_drops_cnt) {
+ qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
+ q->deferred_drops_len);
+ q->deferred_drops_cnt = 0;
+ q->deferred_drops_len = 0;
+ }
+ return skb;
+}
+
+static s64 __scale_delta(u64 diff)
+{
+ do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
+ return diff;
+}
+
+static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
+ u64 *qdelay_l)
+{
+ u64 now, qc, ql;
+
+ now = ktime_get_ns();
+ qc = READ_ONCE(q->c_head_ts);
+ ql = READ_ONCE(q->l_head_ts);
+
+ *qdelay_c = qc ? now - qc : 0;
+ *qdelay_l = ql ? now - ql : 0;
+}
+
+static u32 calculate_probability(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ u32 new_prob;
+ u64 qdelay_c;
+ u64 qdelay_l;
+ u64 qdelay;
+ s64 delta;
+
+ get_queue_delays(q, &qdelay_c, &qdelay_l);
+ qdelay = max(qdelay_l, qdelay_c);
+
+ /* Alpha and beta take at most 32b, i.e, the delay difference would
+ * overflow for queuing delay differences > ~4.2sec.
+ */
+ delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha;
+ delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta;
+ q->last_qdelay = qdelay;
+
+ /* Bound new_prob between 0 and MAX_PROB */
+ if (delta > 0) {
+ new_prob = __scale_delta(delta) + q->pi2_prob;
+ if (new_prob < q->pi2_prob)
+ new_prob = MAX_PROB;
+ } else {
+ new_prob = q->pi2_prob - __scale_delta(~delta + 1);
+ if (new_prob > q->pi2_prob)
+ new_prob = 0;
+ }
+
+ /* If we do not drop on overload, ensure we cap the L4S probability to
+ * 100% to keep window fairness when overflowing.
+ */
+ if (!q->drop_overload)
+ return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
+ return new_prob;
+}
+
+static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
+{
+ /* Apply rule of thumb, i.e., doubling the packet length,
+ * to further include per packet overhead in memory_limit.
+ */
+ u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
+
+ if (upper_32_bits(memlim))
+ return U32_MAX;
+ else
+ return lower_32_bits(memlim);
+}
+
+static u32 convert_us_to_nsec(u32 us)
+{
+ u64 ns = mul_u32_u32(us, NSEC_PER_USEC);
+
+ if (upper_32_bits(ns))
+ return U32_MAX;
+
+ return lower_32_bits(ns);
+}
+
+static u32 convert_ns_to_usec(u64 ns)
+{
+ do_div(ns, NSEC_PER_USEC);
+ if (upper_32_bits(ns))
+ return U32_MAX;
+
+ return lower_32_bits(ns);
+}
+
+static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
+{
+ struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock; /* to lock qdisc for probability calculations */
+
+ rcu_read_lock();
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spin_lock(root_lock);
+
+ WRITE_ONCE(q->pi2_prob, calculate_probability(sch));
+ hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
+
+ spin_unlock(root_lock);
+ rcu_read_unlock();
+ return HRTIMER_RESTART;
+}
+
+static struct netlink_range_validation dualpi2_alpha_beta_range = {
+ .min = 1,
+ .max = ALPHA_BETA_MAX,
+};
+
+static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
+ [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
+ [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
+ [TCA_DUALPI2_TARGET] = { .type = NLA_U32 },
+ [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
+ [TCA_DUALPI2_ALPHA] =
+ NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
+ [TCA_DUALPI2_BETA] =
+ NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
+ [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 },
+ [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 },
+ [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 },
+ [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
+ [TCA_DUALPI2_DROP_OVERLOAD] =
+ NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX),
+ [TCA_DUALPI2_DROP_EARLY] =
+ NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX),
+ [TCA_DUALPI2_C_PROTECTION] =
+ NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC),
+ [TCA_DUALPI2_ECN_MASK] =
+ NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT,
+ TCA_DUALPI2_ECN_MASK_MAX),
+ [TCA_DUALPI2_SPLIT_GSO] =
+ NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX),
+};
+
+static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_DUALPI2_MAX + 1];
+ struct dualpi2_sched_data *q;
+ int old_backlog;
+ int old_qlen;
+ int err;
+
+ if (!opt || !nla_len(opt)) {
+ NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required");
+ return -EINVAL;
+ }
+ err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
+ extack);
+ if (err < 0)
+ return err;
+ if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) {
+ NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes");
+ return -EINVAL;
+ }
+
+ q = qdisc_priv(sch);
+ sch_tree_lock(sch);
+
+ if (tb[TCA_DUALPI2_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
+
+ WRITE_ONCE(sch->limit, limit);
+ WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
+ }
+
+ if (tb[TCA_DUALPI2_MEMORY_LIMIT])
+ WRITE_ONCE(q->memory_limit,
+ nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
+
+ if (tb[TCA_DUALPI2_TARGET]) {
+ u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
+
+ WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
+ }
+
+ if (tb[TCA_DUALPI2_TUPDATE]) {
+ u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
+
+ WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate));
+ }
+
+ if (tb[TCA_DUALPI2_ALPHA]) {
+ u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
+
+ WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
+ }
+
+ if (tb[TCA_DUALPI2_BETA]) {
+ u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
+
+ WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
+ }
+
+ if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
+ u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
+
+ WRITE_ONCE(q->step_in_packets, true);
+ WRITE_ONCE(q->step_thresh, step_th);
+ } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
+ u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
+
+ WRITE_ONCE(q->step_in_packets, false);
+ WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th));
+ }
+
+ if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
+ WRITE_ONCE(q->min_qlen_step,
+ nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
+
+ if (tb[TCA_DUALPI2_COUPLING]) {
+ u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
+
+ WRITE_ONCE(q->coupling_factor, coupling);
+ }
+
+ if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
+ u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
+
+ WRITE_ONCE(q->drop_overload, (bool)drop_overload);
+ }
+
+ if (tb[TCA_DUALPI2_DROP_EARLY]) {
+ u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
+
+ WRITE_ONCE(q->drop_early, (bool)drop_early);
+ }
+
+ if (tb[TCA_DUALPI2_C_PROTECTION]) {
+ u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
+
+ dualpi2_calculate_c_protection(sch, q, wc);
+ }
+
+ if (tb[TCA_DUALPI2_ECN_MASK]) {
+ u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
+
+ WRITE_ONCE(q->ecn_mask, ecn_mask);
+ }
+
+ if (tb[TCA_DUALPI2_SPLIT_GSO]) {
+ u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
+
+ WRITE_ONCE(q->split_gso, (bool)split_gso);
+ }
+
+ old_qlen = qdisc_qlen(sch);
+ old_backlog = sch->qstats.backlog;
+ while (qdisc_qlen(sch) > sch->limit ||
+ q->memory_used > q->memory_limit) {
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
+
+ q->memory_used -= skb->truesize;
+ qdisc_qstats_backlog_dec(sch, skb);
+ rtnl_qdisc_drop(skb, sch);
+ }
+ qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
+ old_backlog - sch->qstats.backlog);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */
+static void dualpi2_reset_default(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ q->sch->limit = 10000; /* Max 125ms at 1Gbps */
+ q->memory_limit = get_memory_limit(sch, q->sch->limit);
+
+ q->pi2_target = 15 * NSEC_PER_MSEC;
+ q->pi2_tupdate = 16 * NSEC_PER_MSEC;
+ q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */
+ q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */
+
+ q->step_thresh = 1 * NSEC_PER_MSEC;
+ q->step_in_packets = false;
+
+ dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */
+
+ q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */
+ q->min_qlen_step = 0; /* Always apply step mark in L-queue */
+ q->coupling_factor = 2; /* window fairness for equal RTTs */
+ q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */
+ q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */
+ q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */
+}
+
+static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, 1), extack);
+ if (!q->l_queue)
+ return -ENOMEM;
+
+ err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack);
+ if (err)
+ return err;
+
+ q->sch = sch;
+ dualpi2_reset_default(sch);
+ hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+
+ if (opt && nla_len(opt)) {
+ err = dualpi2_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
+ HRTIMER_MODE_ABS_PINNED);
+ return 0;
+}
+
+static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ bool step_in_pkts;
+ u32 step_th;
+
+ step_in_pkts = READ_ONCE(q->step_in_packets);
+ step_th = READ_ONCE(q->step_thresh);
+
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_put_failure;
+
+ if (step_in_pkts &&
+ (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
+ READ_ONCE(q->memory_limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_TARGET,
+ convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
+ nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
+ convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
+ nla_put_u32(skb, TCA_DUALPI2_ALPHA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
+ nla_put_u32(skb, TCA_DUALPI2_BETA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
+ nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) ||
+ nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
+ READ_ONCE(q->min_qlen_step)) ||
+ nla_put_u8(skb, TCA_DUALPI2_COUPLING,
+ READ_ONCE(q->coupling_factor)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
+ READ_ONCE(q->drop_overload)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
+ READ_ONCE(q->drop_early)) ||
+ nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
+ READ_ONCE(q->c_protection_wc)) ||
+ nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
+ nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
+ goto nla_put_failure;
+
+ if (!step_in_pkts &&
+ (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
+ READ_ONCE(q->memory_limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_TARGET,
+ convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
+ nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
+ convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
+ nla_put_u32(skb, TCA_DUALPI2_ALPHA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
+ nla_put_u32(skb, TCA_DUALPI2_BETA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
+ nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US,
+ convert_ns_to_usec(step_th)) ||
+ nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
+ READ_ONCE(q->min_qlen_step)) ||
+ nla_put_u8(skb, TCA_DUALPI2_COUPLING,
+ READ_ONCE(q->coupling_factor)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
+ READ_ONCE(q->drop_overload)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
+ READ_ONCE(q->drop_early)) ||
+ nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
+ READ_ONCE(q->c_protection_wc)) ||
+ nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
+ nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+}
+
+static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct tc_dualpi2_xstats st = {
+ .prob = READ_ONCE(q->pi2_prob),
+ .packets_in_c = q->packets_in_c,
+ .packets_in_l = q->packets_in_l,
+ .maxq = q->maxq,
+ .ecn_mark = q->ecn_mark,
+ .credit = q->c_protection_credit,
+ .step_marks = q->step_marks,
+ .memory_used = q->memory_used,
+ .max_memory_used = q->max_memory_used,
+ .memory_limit = q->memory_limit,
+ };
+ u64 qc, ql;
+
+ get_queue_delays(q, &qc, &ql);
+ st.delay_l = convert_ns_to_usec(ql);
+ st.delay_c = convert_ns_to_usec(qc);
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+/* Reset both L-queue and C-queue, internal packet counters, PI probability,
+ * C-queue protection credit, and timestamps, while preserving current
+ * configuration of DUALPI2.
+ */
+static void dualpi2_reset(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+ qdisc_reset_queue(q->l_queue);
+ q->c_head_ts = 0;
+ q->l_head_ts = 0;
+ q->pi2_prob = 0;
+ q->packets_in_c = 0;
+ q->packets_in_l = 0;
+ q->maxq = 0;
+ q->ecn_mark = 0;
+ q->step_marks = 0;
+ q->memory_used = 0;
+ q->max_memory_used = 0;
+ dualpi2_reset_c_protection(q);
+}
+
+static void dualpi2_destroy(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ q->pi2_tupdate = 0;
+ hrtimer_cancel(&q->pi2_timer);
+ if (q->l_queue)
+ qdisc_put(q->l_queue);
+ tcf_block_put(q->tcf_block);
+}
+
+static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static void dualpi2_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->tcf_block;
+}
+
+static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ /* We statically define only 2 queues */
+ for (i = 0; i < 2; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+/* Minimal class support to handle tc filters */
+static const struct Qdisc_class_ops dualpi2_class_ops = {
+ .leaf = dualpi2_leaf,
+ .find = dualpi2_find,
+ .tcf_block = dualpi2_tcf_block,
+ .bind_tcf = dualpi2_bind,
+ .unbind_tcf = dualpi2_unbind,
+ .walk = dualpi2_walk,
+};
+
+static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
+ .id = "dualpi2",
+ .cl_ops = &dualpi2_class_ops,
+ .priv_size = sizeof(struct dualpi2_sched_data),
+ .enqueue = dualpi2_qdisc_enqueue,
+ .dequeue = dualpi2_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = dualpi2_init,
+ .destroy = dualpi2_destroy,
+ .reset = dualpi2_reset,
+ .change = dualpi2_change,
+ .dump = dualpi2_dump,
+ .dump_stats = dualpi2_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init dualpi2_module_init(void)
+{
+ return register_qdisc(&dualpi2_qdisc_ops);
+}
+
+static void __exit dualpi2_module_exit(void)
+{
+ unregister_qdisc(&dualpi2_qdisc_ops);
+}
+
+module_init(dualpi2_module_init);
+module_exit(dualpi2_module_exit);
+
+MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler");
+MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>");
+MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>");
+MODULE_AUTHOR("Olga Albisser <olga@albisser.org>");
+MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>");
+MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("1.0");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 16afb834fe4a..1e008a228ebd 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -740,6 +740,8 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
err = skb_array_produce(q, skb);
if (unlikely(err)) {
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT);
+
if (qdisc_is_percpu_stats(qdisc))
return qdisc_drop_cpu(skb, qdisc, to_free);
else
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 5a7745170e84..d8fd35da32a7 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -835,22 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
}
}
-static unsigned int
-qdisc_peek_len(struct Qdisc *sch)
-{
- struct sk_buff *skb;
- unsigned int len;
-
- skb = sch->ops->peek(sch);
- if (unlikely(skb == NULL)) {
- qdisc_warn_nonwc("qdisc_peek_len", sch);
- return 0;
- }
- len = qdisc_pkt_len(skb);
-
- return len;
-}
-
static void
hfsc_adjust_levels(struct hfsc_class *cl)
{
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 14bf71f57057..c968ea763774 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -821,7 +821,9 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
u32 *pid;
} stk[TC_HTB_MAXDEPTH], *sp = stk;
- BUG_ON(!hprio->row.rb_node);
+ if (unlikely(!hprio->row.rb_node))
+ return NULL;
+
sp->root = hprio->row.rb_node;
sp->pptr = &hprio->ptr;
sp->pid = &hprio->last_ptr_id;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index fdd79d3ccd8c..eafc316ae319 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -973,6 +973,41 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
return 0;
}
+static const struct Qdisc_class_ops netem_class_ops;
+
+static int check_netem_in_tree(struct Qdisc *sch, bool duplicates,
+ struct netlink_ext_ack *extack)
+{
+ struct Qdisc *root, *q;
+ unsigned int i;
+
+ root = qdisc_root_sleeping(sch);
+
+ if (sch != root && root->ops->cl_ops == &netem_class_ops) {
+ if (duplicates ||
+ ((struct netem_sched_data *)qdisc_priv(root))->duplicate)
+ goto err;
+ }
+
+ if (!qdisc_dev(root))
+ return 0;
+
+ hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) {
+ if (sch != q && q->ops->cl_ops == &netem_class_ops) {
+ if (duplicates ||
+ ((struct netem_sched_data *)qdisc_priv(q))->duplicate)
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ NL_SET_ERR_MSG(extack,
+ "netem: cannot mix duplicating netems with other netems in tree");
+ return -EINVAL;
+}
+
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
@@ -1031,6 +1066,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
q->gap = qopt->gap;
q->counter = 0;
q->loss = qopt->loss;
+
+ ret = check_netem_in_tree(sch, qopt->duplicate, extack);
+ if (ret)
+ goto unlock;
+
q->duplicate = qopt->duplicate;
/* for compatibility with earlier versions.
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bf1282cb22eb..2255355e51d3 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -412,7 +412,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
bool existing = false;
struct nlattr *tb[TCA_QFQ_MAX + 1];
struct qfq_aggregate *new_agg = NULL;
- u32 weight, lmax, inv_w;
+ u32 weight, lmax, inv_w, old_weight, old_lmax;
int err;
int delta_w;
@@ -443,12 +443,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
inv_w = ONE_FP / weight;
weight = ONE_FP / inv_w;
- if (cl != NULL &&
- lmax == cl->agg->lmax &&
- weight == cl->agg->class_weight)
- return 0; /* nothing to change */
+ if (cl != NULL) {
+ sch_tree_lock(sch);
+ old_weight = cl->agg->class_weight;
+ old_lmax = cl->agg->lmax;
+ sch_tree_unlock(sch);
+ if (lmax == old_lmax && weight == old_weight)
+ return 0; /* nothing to change */
+ }
- delta_w = weight - (cl ? cl->agg->class_weight : 0);
+ delta_w = weight - (cl ? old_weight : 0);
if (q->wsum + delta_w > QFQ_MAX_WSUM) {
NL_SET_ERR_MSG_FMT_MOD(extack,
@@ -532,9 +536,6 @@ destroy_class:
static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
{
- struct qfq_sched *q = qdisc_priv(sch);
-
- qfq_rm_from_agg(q, cl);
gen_kill_estimator(&cl->rate_est);
qdisc_put(cl->qdisc);
kfree(cl);
@@ -555,6 +556,7 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
qdisc_purge_queue(cl->qdisc);
qdisc_class_hash_remove(&q->clhash, &cl->common);
+ qfq_rm_from_agg(q, cl);
sch_tree_unlock(sch);
@@ -625,6 +627,7 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
{
struct qfq_class *cl = (struct qfq_class *)arg;
struct nlattr *nest;
+ u32 class_weight, lmax;
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle = cl->common.classid;
@@ -633,8 +636,13 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
- nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
+
+ sch_tree_lock(sch);
+ class_weight = cl->agg->class_weight;
+ lmax = cl->agg->lmax;
+ sch_tree_unlock(sch);
+ if (nla_put_u32(skb, TCA_QFQ_WEIGHT, class_weight) ||
+ nla_put_u32(skb, TCA_QFQ_LMAX, lmax))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -651,8 +659,10 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
memset(&xstats, 0, sizeof(xstats));
+ sch_tree_lock(sch);
xstats.weight = cl->agg->class_weight;
xstats.lmax = cl->agg->lmax;
+ sch_tree_unlock(sch);
if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
@@ -989,7 +999,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg,
if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
list_del_init(&cl->alist);
- else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
+ else if (cl->deficit < qdisc_peek_len(cl->qdisc)) {
cl->deficit += agg->lmax;
list_move_tail(&cl->alist, &agg->active);
}
@@ -1491,6 +1501,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
common.hnode) {
+ qfq_rm_from_agg(q, cl);
qfq_destroy_class(sch, cl);
}
}
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 14021b812329..e759e43ad27e 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -998,7 +998,7 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
[TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32,
- TC_QOPT_MAX_QUEUE),
+ TC_QOPT_MAX_QUEUE - 1),
[TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 },
[TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32,
TC_FP_EXPRESS,
@@ -1328,13 +1328,15 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
stab = rtnl_dereference(q->root->stab);
- oper = rtnl_dereference(q->oper_sched);
+ rcu_read_lock();
+ oper = rcu_dereference(q->oper_sched);
if (oper)
taprio_update_queue_max_sdu(q, oper, stab);
- admin = rtnl_dereference(q->admin_sched);
+ admin = rcu_dereference(q->admin_sched);
if (admin)
taprio_update_queue_max_sdu(q, admin, stab);
+ rcu_read_unlock();
break;
}
@@ -1696,19 +1698,15 @@ static int taprio_parse_tc_entry(struct Qdisc *sch,
if (err < 0)
return err;
- if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) {
+ if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_TAPRIO_TC_ENTRY_INDEX)) {
NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
return -EINVAL;
}
tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
- if (tc >= TC_QOPT_MAX_QUEUE) {
- NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
- return -ERANGE;
- }
-
if (*seen_tcs & BIT(tc)) {
- NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_TC_ENTRY_INDEX],
+ "Duplicate tc entry");
return -EINVAL;
}