Diffstat (limited to 'net/sched')
33 files changed, 1978 insertions, 208 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a800127effcd..6ddff028b81a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -403,6 +403,30 @@ config NET_SCH_ETS If unsure, say N. +config NET_SCH_BPF + bool "BPF-based Qdisc" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF + help + This option allows BPF-based queueing disiplines. With BPF struct_ops, + users can implement supported operators in Qdisc_ops using BPF programs. + The queue holding skb can be built with BPF maps or graphs. + + Say Y here if you want to use BPF-based Qdisc. + + If unsure, say N. + +config NET_SCH_DUALPI2 + tristate "Dual Queue PI Square (DUALPI2) scheduler" + help + Say Y here if you want to use the Dual Queue Proportional Integral + Controller Improved with a Square scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc9332 + + To compile this driver as a module, choose M here: the module + will be called sch_dualpi2. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help @@ -784,7 +808,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET - select CRC32 + select NET_CRC32C help Say Y here to update some common checksum after some direct packet alterations. diff --git a/net/sched/Makefile b/net/sched/Makefile index 82c3f78ca486..5078ea84e6ad 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -62,6 +62,8 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o +obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o +obj-$(CONFIG_NET_SCH_DUALPI2) += sch_dualpi2.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 839790043256..9e468e463467 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -933,18 +933,25 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; + bool mutex_taken = false; struct tc_action *p; - int ret; unsigned long id = 1; unsigned long tmp; + int ret; idr_for_each_entry_ul(idr, p, tmp, id) { + if (tc_act_in_hw(p) && !mutex_taken) { + rtnl_lock(); + mutex_taken = true; + } ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } + if (mutex_taken) + rtnl_unlock(); idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); @@ -1461,17 +1468,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; - struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 2]; struct tc_action *act; size_t sz = 0; int err; int i; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL, extack); if (err < 0) return err; + /* The nested attributes are parsed as types, but they are really an + * array of actions. So we parse one more than we can handle, and return + * an error if the last one is set (as that indicates that the request + * contained more than the maximum number of actions). 
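For anyone building a kernel with the two schedulers added above, a minimal .config fragment would be (illustrative only, derived from the Kconfig entries in this hunk: NET_SCH_BPF is a bool that additionally depends on BPF_SYSCALL, BPF_JIT and DEBUG_INFO_BTF, while NET_SCH_DUALPI2 is tristate and builds the sch_dualpi2 module when set to m):

    CONFIG_BPF_SYSCALL=y
    CONFIG_BPF_JIT=y
    CONFIG_DEBUG_INFO_BTF=y
    CONFIG_NET_SCH_BPF=y
    CONFIG_NET_SCH_DUALPI2=m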
+ */ + if (tb[TCA_ACT_MAX_PRIO + 1]) { + NL_SET_ERR_MSG_FMT(extack, + "Only %d actions supported per filter", + TCA_ACT_MAX_PRIO); + return -EINVAL; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 0fce631e7c91..3e89927d7116 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -88,7 +88,7 @@ count: /* using overlimits stats to count how many packets marked */ tcf_action_inc_overlimit_qstats(&ca->common); out: - return READ_ONCE(ca->tcf_action); + return parms->action; } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { @@ -167,6 +167,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (err < 0) goto release_idr; + nparms->action = parm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); @@ -190,20 +192,20 @@ out_free: static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_connmark_info *ci = to_connmark(a); + const struct tcf_connmark_parms *parms; struct tc_connmark opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - struct tcf_connmark_parms *parms; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + parms = rcu_dereference(ci->parms); - opt.action = ci->tcf_action; + opt.action = parms->action; opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -212,12 +214,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5cc8e407e791..0939e6b2ba4d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -99,6 +99,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, goto put_chain; } params_new->update_flags = parm->update_flags; + params_new->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -580,7 +581,7 @@ TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); + action = params->action; if (unlikely(action == TC_ACT_SHOT)) goto drop; @@ -631,9 +632,9 @@ drop: static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_csum *p = to_tcf_csum(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_csum *p = to_tcf_csum(a); - struct tcf_csum_params *params; + const struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, @@ -641,10 +642,9 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, }; struct tcf_t t; - spin_lock_bh(&p->tcf_lock); - params = rcu_dereference_protected(p->params, - lockdep_is_held(&p->tcf_lock)); - opt.action 
= p->tcf_action; + rcu_read_lock(); + params = rcu_dereference(p->params); + opt.action = params->action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -653,12 +653,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index c02f39efc6ef..6749a4a9a9cd 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -977,7 +977,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, p = rcu_dereference_bh(c->params); - retval = READ_ONCE(c->tcf_action); + retval = p->action; commit = p->ct_action & TCA_CT_ACT_COMMIT; clear = p->ct_action & TCA_CT_ACT_CLEAR; tmpl = p->tmpl; @@ -1409,6 +1409,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (err) goto cleanup; + params->action = parm->action; spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params = rcu_replace_pointer(c->params, params, @@ -1442,8 +1443,8 @@ static void tcf_ct_cleanup(struct tc_action *a) } static int tcf_ct_dump_key_val(struct sk_buff *skb, - void *val, int val_type, - void *mask, int mask_type, + const void *val, int val_type, + const void *mask, int mask_type, int len) { int err; @@ -1464,9 +1465,9 @@ static int tcf_ct_dump_key_val(struct sk_buff *skb, return 0; } -static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) +static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p) { - struct nf_nat_range2 *range = &p->range; + const struct nf_nat_range2 *range = &p->range; if (!(p->ct_action & TCA_CT_ACT_NAT)) return 0; @@ -1504,7 +1505,8 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) return 0; } -static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) +static int tcf_ct_dump_helper(struct sk_buff *skb, + const struct nf_conntrack_helper *helper) { if (!helper) return 0; @@ -1521,9 +1523,8 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ct *c = to_ct(a); - struct tcf_ct_params *p; - + const struct tcf_ct *c = to_ct(a); + const struct tcf_ct_params *p; struct tc_ct opt = { .index = c->tcf_index, .refcnt = refcount_read(&c->tcf_refcnt) - ref, @@ -1531,10 +1532,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&c->tcf_lock); - p = rcu_dereference_protected(c->params, - lockdep_is_held(&c->tcf_lock)); - opt.action = c->tcf_action; + rcu_read_lock(); + p = rcu_dereference(c->params); + opt.action = p->action; if (tcf_ct_dump_key_val(skb, &p->ct_action, TCA_CT_ACTION, @@ -1579,11 +1579,11 @@ skip_dump: tcf_tm_dump(&t, &c->tcf_tm); if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD)) goto nla_put_failure; - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 5b1241ddc758..71efe04d00b5 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -44,9 +44,9 @@ static void 
tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -57,9 +57,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb) { - ca->stats_cpmark_set++; + atomic64_inc(&ca->stats_cpmark_set); skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } @@ -88,13 +88,11 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, struct tcf_ctinfo_params *cp; struct nf_conn *ct; int proto, wlen; - int action; cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); - action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { @@ -141,7 +139,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, if (thash) nf_ct_put(ct); out: - return action; + return cp->action; } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { @@ -258,6 +256,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, cp_new->mode |= CTINFO_MODE_CPMARK; } + cp_new->action = actparm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, @@ -282,25 +282,24 @@ release_idr: static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - struct tcf_ctinfo *ci = to_ctinfo(a); + const struct tcf_ctinfo *ci = to_ctinfo(a); + unsigned char *b = skb_tail_pointer(skb); + const struct tcf_ctinfo_params *cp; struct tc_ctinfo opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - unsigned char *b = skb_tail_pointer(skb); - struct tcf_ctinfo_params *cp; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - cp = rcu_dereference_protected(ci->params, - lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + cp = rcu_dereference(ci->params); tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) goto nla_put_failure; - opt.action = ci->tcf_action; + opt.action = cp->action; if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) goto nla_put_failure; @@ -323,22 +322,25 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, } if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, - ci->stats_dscp_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_set), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, - ci->stats_dscp_error, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_error), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, - ci->stats_cpmark_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_cpmark_set), + TCA_CTINFO_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 
5b3814365924..5f01f567c934 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_nest_level); + +#ifndef CONFIG_PREEMPT_RT +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); +} + +static void tcf_mirred_nest_level_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); +} + +#else +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return current->net_xmit.sched_mirred_nest++; +} + +static void tcf_mirred_nest_level_dec(void) +{ + current->net_xmit.sched_mirred_nest--; +} +#endif static bool tcf_mirred_is_act_redirect(int action) { @@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int m_eaction; u32 blockid; - nest_level = __this_cpu_inc_return(mirred_nest_level); + nest_level = tcf_mirred_nest_level_inc_return(); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); @@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, retval); dec_nest_level: - __this_cpu_dec(mirred_nest_level); + tcf_mirred_nest_level_dec(); return retval; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 9f86f4e666d3..6654011dcd2b 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -57,7 +57,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; - int ret, mac_len; + int mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -72,8 +72,6 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, mac_len = skb_network_offset(skb); } - ret = READ_ONCE(m->tcf_action); - p = rcu_dereference_bh(m->mpls_p); switch (p->tcfm_action) { @@ -122,7 +120,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - return ret; + return p->action; drop: qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); @@ -296,6 +294,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, ACT_MPLS_BOS_NOT_SET); p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO], htons(ETH_P_MPLS_UC)); + p->action = parm->action; spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -330,8 +329,8 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_mpls *m = to_mpls(a); - struct tcf_mpls_params *p; + const struct tcf_mpls *m = to_mpls(a); + const struct tcf_mpls_params *p; struct tc_mpls opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, @@ -339,10 +338,10 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&m->tcf_lock); - opt.action = m->tcf_action; - p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(m->mpls_p); opt.m_action = p->tcfm_action; + opt.action = p->action; if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -370,12 +369,12 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) goto nla_put_failure; - spin_unlock_bh(&m->tcf_lock); + rcu_read_unlock(); 
return skb->len; nla_put_failure: - spin_unlock_bh(&m->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -EMSGSIZE; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index d541f553805f..26241d80ebe0 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -91,6 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, nparm->new_addr = parm->new_addr; nparm->mask = parm->mask; nparm->flags = parm->flags; + nparm->action = parm->action; p = to_tcf_nat(*a); @@ -130,17 +131,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); - parms = rcu_dereference_bh(p->parms); + action = parms->action; + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + old_addr = parms->old_addr; new_addr = parms->new_addr; mask = parms->mask; egress = parms->flags & TCA_NAT_FLAG_EGRESS; - if (unlikely(action == TC_ACT_SHOT)) - goto drop; - noff = skb_network_offset(skb); if (!pskb_may_pull(skb, sizeof(*iph) + noff)) goto drop; @@ -268,21 +268,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat_parms *parms; struct tc_nat opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; - struct tcf_nat_parms *parms; struct tcf_t t; - spin_lock_bh(&p->tcf_lock); - - opt.action = p->tcf_action; + rcu_read_lock(); - parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock)); + parms = rcu_dereference(p->parms); + opt.action = parms->action; opt.old_addr = parms->old_addr; opt.new_addr = parms->new_addr; opt.mask = parms->mask; @@ -294,12 +293,12 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fc0a35a7b62a..4b65901397a8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -279,7 +279,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } p = to_pedit(*a); - + nparms->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); @@ -483,7 +483,7 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, bad: tcf_action_inc_overlimit_qstats(&p->common); done: - return p->tcf_action; + return parms->action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, @@ -500,19 +500,19 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_pedit *p = to_pedit(a); - struct tcf_pedit_parms *parms; + const struct tcf_pedit *p = to_pedit(a); + const struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; - spin_lock_bh(&p->tcf_lock); - parms = rcu_dereference_protected(p->parms, 1); + rcu_read_lock(); + parms = rcu_dereference(p->parms); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { - spin_unlock_bh(&p->tcf_lock); + 
rcu_read_unlock(); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; @@ -521,7 +521,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; - opt->action = p->tcf_action; + opt->action = parms->action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; @@ -540,13 +540,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); kfree(opt); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); kfree(opt); return -1; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a214ed681142..0e1c61183379 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -198,6 +198,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, psched_ppscfg_precompute(&new->ppsrate, pps); } + new->action = parm->action; spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); @@ -254,8 +255,8 @@ TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, tcf_lastuse_update(&police->tcf_tm); bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); - ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); + ret = p->action; if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; @@ -338,9 +339,9 @@ static void tcf_police_stats_update(struct tc_action *a, static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_police *police = to_police(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_police *police = to_police(a); - struct tcf_police_params *p; + const struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, @@ -348,10 +349,9 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&police->tcf_lock); - opt.action = police->tcf_action; - p = rcu_dereference_protected(police->params, - lockdep_is_held(&police->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(police->params); + opt.action = p->action; opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); if (p->rate_present) { @@ -392,12 +392,12 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1f1d9ce3e968..8c1d1554f657 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -43,13 +43,11 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; - int action; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); - action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) skb->priority = params->priority; @@ -85,7 +83,7 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff 
*skb, } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; - return action; + return params->action; err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -262,6 +260,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; + params_new->action = parm->action; spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, @@ -284,9 +283,9 @@ release_idr: static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_skbedit *d = to_skbedit(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbedit *d = to_skbedit(a); - struct tcf_skbedit_params *params; + const struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, @@ -295,10 +294,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, u64 pure_flags = 0; struct tcf_t t; - spin_lock_bh(&d->tcf_lock); - params = rcu_dereference_protected(d->params, - lockdep_is_held(&d->tcf_lock)); - opt.action = d->tcf_action; + rcu_read_lock(); + params = rcu_dereference(d->params); + opt.action = params->action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -333,12 +331,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c new file mode 100644 index 000000000000..adcb618a2bfc --- /dev/null +++ b/net/sched/bpf_qdisc.c @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/types.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + +#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void))) +#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void))) + +static struct bpf_struct_ops bpf_Qdisc_ops; + +struct bpf_sched_data { + struct qdisc_watchdog watchdog; +}; + +struct bpf_sk_buff_ptr { + struct sk_buff *skb; +}; + +static int bpf_qdisc_init(struct btf *btf) +{ + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr) + +static bool bpf_qdisc_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct btf *btf = prog->aux->attach_btf; + u32 arg; + + arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off); + if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) { + if (arg == 2 && type == BPF_READ) { + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + info->btf = btf; + info->btf_id = bpf_sk_buff_ptr_ids[0]; + return true; + } + } + + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct Qdisc, limit): + *end = offsetofend(struct Qdisc, limit); + break; + case 
offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen): + *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen); + break; + case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1: + *end = offsetofend(struct Qdisc, qstats); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct sk_buff, tstamp): + *end = offsetofend(struct sk_buff, tstamp); + break; + case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ... + offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, + data[QDISC_CB_PRIV_LEN - 1]): + *end = offsetof(struct sk_buff, cb) + + offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + const struct btf_type *t, *skbt, *qdisct; + size_t end; + int err; + + skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]); + qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == skbt) { + err = bpf_qdisc_sk_buff_access(log, reg, off, &end); + } else if (t == qdisct) { + err = bpf_qdisc_qdisc_access(log, reg, off, &end); + } else { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + if (err) { + bpf_log(log, "no write support to %s at off %d\n", + btf_name_by_offset(reg->btf, t->name_off), off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of %s ended at %zu\n", + off, size, btf_name_by_offset(reg->btf, t->name_off), end); + return -EACCES; + } + + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_init_prologue_ids, func, bpf_qdisc_init_prologue) + +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init)) + return 0; + + /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx". + * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_init_prologue(r1, r2); + * if r0 == 0 goto pc+1; + * BPF_EXIT; + * r1 = r6; // r1 will be "u64 *ctx". 
+ */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1); + *insn++ = BPF_EXIT_INSN(); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_reset_destroy_epilogue_ids, func, bpf_qdisc_reset_destroy_epilogue) + +static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) && + prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy)) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_reset_destroy_epilogue(r1); + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +__bpf_kfunc_start_defs(); + +/* bpf_skb_get_hash - Get the flow hash of an skb. + * @skb: The skb to get the flow hash from. + */ +__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +/* bpf_kfree_skb - Release an skb's reference and drop it immediately. + * @skb: The skb whose reference to be released and dropped. + */ +__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. + * @skb: The skb whose reference to be released and dropped. + * @to_free_list: The list of skbs to be dropped. + */ +__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb, + struct bpf_sk_buff_ptr *to_free_list) +{ + __qdisc_drop(skb, (struct sk_buff **)to_free_list); +} + +/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer. + * @sch: The qdisc to be scheduled. + * @expire: The expiry time of the timer. + * @delta_ns: The slack range of the timer. + */ +__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns); +} + +/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */ +__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch, + struct netlink_ext_ack *extack) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *p; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (sch->parent != TC_H_ROOT) { + /* If qdisc_lookup() returns NULL, it means .init is called by + * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc + * has not been added to qdisc_hash yet. 
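To show how the struct_ops plumbing above is meant to be consumed, here is a hedged sketch of the BPF-side C for an enqueue operator (illustrative only, not part of this patch or its selftests; the kfunc prototype and the enqueue signature follow the declarations registered in this file, the NET_XMIT_* values are copied from include/linux/netdevice.h, and the actual queueing backend is elided):

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    /* Not emitted into vmlinux.h; values from include/linux/netdevice.h. */
    #define NET_XMIT_SUCCESS 0x00
    #define NET_XMIT_DROP    0x01

    /* Kfunc added by this file; to_free is the struct bpf_sk_buff_ptr that
     * bpf_qdisc_is_valid_access() exposes as the third enqueue argument.
     */
    void bpf_qdisc_skb_drop(struct sk_buff *skb,
                            struct bpf_sk_buff_ptr *to_free) __ksym;

    SEC("struct_ops/sketch_enqueue")
    int BPF_PROG(sketch_enqueue, struct sk_buff *skb, struct Qdisc *sch,
                 struct bpf_sk_buff_ptr *to_free)
    {
            if (sch->q.qlen >= sch->limit) {
                    /* Defer the free to the kernel's to_free batch. */
                    bpf_qdisc_skb_drop(skb, to_free);
                    return NET_XMIT_DROP;
            }
            /* ...store skb in a BPF map-backed queue (elided)... */
            return NET_XMIT_SUCCESS;
    }

    SEC(".struct_ops.link")
    struct Qdisc_ops sketch_ops = {
            .enqueue = (void *)sketch_enqueue,
            /* .dequeue, .init, .reset and .destroy must be provided too,
             * see bpf_qdisc_validate() further down.
             */
            .id = "bpf_sketch",
    };

    char _license[] SEC("license") = "GPL";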
+ */ + p = qdisc_lookup(dev, TC_H_MAJ(sch->parent)); + if (p && !(p->flags & TCQ_F_MQROOT)) { + NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq"); + return -EINVAL; + } + } + + return 0; +} + +/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset + * and .destroy + */ +__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + +/* bpf_qdisc_bstats_update - Update Qdisc basic statistics + * @sch: The qdisc from which an skb is dequeued. + * @skb: The skb to be dequeued. + */ +__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) +{ + bstats_update(&sch->bstats, skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(qdisc_kfunc_ids) +BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(qdisc_kfunc_ids) + +BTF_SET_START(qdisc_common_kfunc_set) +BTF_ID(func, bpf_skb_get_hash) +BTF_ID(func, bpf_kfree_skb) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_SET_END(qdisc_common_kfunc_set) + +BTF_SET_START(qdisc_enqueue_kfunc_set) +BTF_ID(func, bpf_qdisc_skb_drop) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_SET_END(qdisc_enqueue_kfunc_set) + +BTF_SET_START(qdisc_dequeue_kfunc_set) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_ID(func, bpf_qdisc_bstats_update) +BTF_SET_END(qdisc_dequeue_kfunc_set) + +enum qdisc_ops_kf_flags { + QDISC_OPS_KF_COMMON = 0, + QDISC_OPS_KF_ENQUEUE = 1 << 0, + QDISC_OPS_KF_DEQUEUE = 1 << 1, +}; + +static const u32 qdisc_ops_context_flags[] = { + [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE, + [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE, + [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON, +}; + +static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + u32 moff, flags; + + if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id)) + return 0; + + if (prog->aux->st_ops != &bpf_Qdisc_ops) + return -EACCES; + + moff = prog->aux->attach_st_ops_member_off; + flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)]; + + if ((flags & QDISC_OPS_KF_ENQUEUE) && + btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id)) + return 0; + + if ((flags & QDISC_OPS_KF_DEQUEUE) && + btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id)) + return 0; + + if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id)) + return 0; + + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = { + .owner = THIS_MODULE, + .set = &qdisc_kfunc_ids, + .filter = bpf_qdisc_kfunc_filter, +}; + +static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_qdisc_is_valid_access, + .btf_struct_access = bpf_qdisc_btf_struct_access, + .gen_prologue = bpf_qdisc_gen_prologue, + .gen_epilogue = bpf_qdisc_gen_epilogue, +}; + +static int bpf_qdisc_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct Qdisc_ops *uqdisc_ops; + struct Qdisc_ops 
*qdisc_ops; + u32 moff; + + uqdisc_ops = (const struct Qdisc_ops *)udata; + qdisc_ops = (struct Qdisc_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct Qdisc_ops, priv_size): + if (uqdisc_ops->priv_size) + return -EINVAL; + qdisc_ops->priv_size = sizeof(struct bpf_sched_data); + return 1; + case offsetof(struct Qdisc_ops, peek): + qdisc_ops->peek = qdisc_peek_dequeued; + return 0; + case offsetof(struct Qdisc_ops, id): + if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id, + sizeof(qdisc_ops->id)) <= 0) + return -EINVAL; + return 1; + } + + return 0; +} + +static int bpf_qdisc_reg(void *kdata, struct bpf_link *link) +{ + return register_qdisc(kdata); +} + +static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link) +{ + return unregister_qdisc(kdata); +} + +static int bpf_qdisc_validate(void *kdata) +{ + struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata; + + if (!ops->enqueue || !ops->dequeue || !ops->init || + !ops->reset || !ops->destroy) + return -EINVAL; + + return 0; +} + +static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch, + struct sk_buff **to_free) +{ + return 0; +} + +static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void Qdisc_ops__reset(struct Qdisc *sch) +{ +} + +static void Qdisc_ops__destroy(struct Qdisc *sch) +{ +} + +static struct Qdisc_ops __bpf_ops_qdisc_ops = { + .enqueue = Qdisc_ops__enqueue, + .dequeue = Qdisc_ops__dequeue, + .init = Qdisc_ops__init, + .reset = Qdisc_ops__reset, + .destroy = Qdisc_ops__destroy, +}; + +static struct bpf_struct_ops bpf_Qdisc_ops = { + .verifier_ops = &bpf_qdisc_verifier_ops, + .reg = bpf_qdisc_reg, + .unreg = bpf_qdisc_unreg, + .validate = bpf_qdisc_validate, + .init_member = bpf_qdisc_init_member, + .init = bpf_qdisc_init, + .name = "Qdisc_ops", + .cfi_stubs = &__bpf_ops_qdisc_ops, + .owner = THIS_MODULE, +}; + +BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb) + +static int __init bpf_qdisc_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = { + { + .btf_id = bpf_sk_buff_ids[0], + .kfunc_btf_id = bpf_sk_buff_dtor_ids[0] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors, + ARRAY_SIZE(skb_kfunc_dtors), + THIS_MODULE); + ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); + + return ret; +} +late_initcall(bpf_qdisc_kfunc_init); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5c2580a07530..5693b41b093f 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -345,7 +345,7 @@ TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb, static void flow_perturbation(struct timer_list *t) { - struct flow_filter *f = from_timer(f, t, perturb_timer); + struct flow_filter *f = timer_container_of(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 420c66203b17..6b3d0af72c39 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -108,7 +108,7 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) struct text_match *tm = EM_TEXT_PRIV(m); struct tcf_em_text conf; - strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1); + strscpy(conf.algo, tm->config->ops->name); conf.from_offset = tm->from_offset; conf.to_offset = 
tm->to_offset; conf.from_layer = tm->from_layer; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f74a097f54ae..d7c767b861a4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -25,6 +25,7 @@ #include <linux/hrtimer.h> #include <linux/slab.h> #include <linux/hashtable.h> +#include <linux/bpf.h> #include <net/netdev_lock.h> #include <net/net_namespace.h> @@ -207,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name) for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -237,7 +238,7 @@ int qdisc_set_default(const char *name) if (ops) { /* Set new default */ - module_put(default_qdisc_ops->owner); + bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); default_qdisc_ops = ops; } write_unlock(&qdisc_mod_lock); @@ -335,17 +336,22 @@ out: return q; } -static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid, + struct netlink_ext_ack *extack) { unsigned long cl; const struct Qdisc_class_ops *cops = p->ops->cl_ops; - if (cops == NULL) - return NULL; + if (cops == NULL) { + NL_SET_ERR_MSG(extack, "Parent qdisc is not classful"); + return ERR_PTR(-EOPNOTSUPP); + } cl = cops->find(p, classid); - if (cl == 0) - return NULL; + if (cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class not found"); + return ERR_PTR(-ENOENT); + } return cops->leaf(p, cl); } @@ -359,7 +365,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -595,16 +601,6 @@ out: qdisc_skb_cb(skb)->pkt_len = pkt_len; } -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) -{ - if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { - pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", - txt, qdisc->ops->id, qdisc->handle >> 16); - qdisc->flags |= TCQ_F_WARN_NONWC; - } -} -EXPORT_SYMBOL(qdisc_warn_nonwc); - static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, @@ -779,15 +775,12 @@ static u32 qdisc_alloc_handle(struct net_device *dev) void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { - bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; bool notify; int drops; - if (n == 0 && len == 0) - return; drops = max_t(int, n, 0); rcu_read_lock(); while ((parentid = sch->parent)) { @@ -796,17 +789,8 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) if (sch->flags & TCQ_F_NOPARENT) break; - /* Notify parent qdisc only if child qdisc becomes empty. - * - * If child was empty even before update then backlog - * counter is screwed and we skip notification because - * parent class is already passive. - * - * If the original child was offloaded then it is allowed - * to be seem as empty, so the parent is notified anyway. - */ - notify = !sch->q.qlen && !WARN_ON_ONCE(!n && - !qdisc_is_offloaded); + /* Notify parent qdisc only if child qdisc becomes empty. 
*/ + notify = !sch->q.qlen; /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { @@ -815,6 +799,9 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { + /* Note that qlen_notify must be idempotent as it may get called + * multiple times. + */ cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); } @@ -1370,7 +1357,7 @@ err_out3: netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: - module_put(ops->owner); + bpf_module_put(ops, ops->owner); err_out: *errp = err; return NULL; @@ -1498,7 +1485,7 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); } else if (dev_ingress_queue(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } @@ -1509,6 +1496,8 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; } + if (IS_ERR(q)) + return PTR_ERR(q); if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Invalid handle"); @@ -1610,7 +1599,9 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); + if (IS_ERR(q)) + return PTR_ERR(q); } else if (dev_ingress_queue_create(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } @@ -1782,7 +1773,7 @@ static void request_qdisc_module(struct nlattr *kind) ops = qdisc_lookup_ops(kind); if (ops) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return; } diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 48dd8c88903f..dbcfb948c867 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1407,7 +1407,10 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_init() */ - hdr_len = skb_transport_offset(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c new file mode 100644 index 000000000000..845375ebd4ea --- /dev/null +++ b/net/sched/sch_dualpi2.c @@ -0,0 +1,1175 @@ +// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +/* Copyright (C) 2024 Nokia + * + * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com> + * Author: Olga Albisser <olga@albisser.org> + * Author: Henrik Steen <henrist@henrist.net> + * Author: Olivier Tilmans <olivier.tilmans@nokia.com> + * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> + * + * DualPI Improved with a Square (dualpi2): + * - Supports congestion controls that comply with the Prague requirements + * in RFC9331 (e.g. TCP-Prague) + * - Supports coupled dual-queue with PI2 as defined in RFC9332 + * - Supports ECN L4S-identifier (IP.ECN==0b*1) + * + * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks, + * they do not meet the 'Prague L4S Requirements' listed in RFC 9331 + * Section 4, so they can only be used with DualPI2 in a datacenter + * context. 
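Assuming an iproute2/tc build that already knows about the new scheduler (a hedged usage example, not part of this kernel patch), attaching and inspecting it would look like:

    tc qdisc add dev eth0 root dualpi2
    tc -s qdisc show dev eth0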
+ * + * References: + * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332 + * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and + * scalable TCP." in proc. ACM CoNEXT'16, 2016. + */ + +#include <linux/errno.h> +#include <linux/hrtimer.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/limits.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> + +#include <net/gso.h> +#include <net/inet_ecn.h> +#include <net/pkt_cls.h> +#include <net/pkt_sched.h> + +/* 32b enable to support flows with windows up to ~8.6 * 1e9 packets + * i.e., twice the maximal snd_cwnd. + * MAX_PROB must be consistent with the RNG in dualpi2_roll(). + */ +#define MAX_PROB U32_MAX + +/* alpha/beta values exchanged over netlink are in units of 256ns */ +#define ALPHA_BETA_SHIFT 8 + +/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later + * computations. Consequently (see and dualpi2_scale_alpha_beta()), their + * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1 + * (~4MHz) as those are given in 1/256th. This enable to tune alpha/beta to + * control flows whose maximal RTTs can be in usec up to few secs. + */ +#define ALPHA_BETA_MAX ((1U << 31) - 1) + +/* Internal alpha/beta are in units of 64ns. + * This enables to use all alpha/beta values in the allowed range without loss + * of precision due to rounding when scaling them internally, e.g., + * scale_alpha_beta(1) will not round down to 0. + */ +#define ALPHA_BETA_GRANULARITY 6 + +#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY) + +/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */ +#define MAX_WC 100 + +struct dualpi2_sched_data { + struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */ + struct Qdisc *sch; /* The Classic queue (C-queue) */ + + /* Registered tc filters */ + struct tcf_proto __rcu *tcf_filters; + struct tcf_block *tcf_block; + + /* PI2 parameters */ + u64 pi2_target; /* Target delay in nanoseconds */ + u32 pi2_tupdate; /* Timer frequency in nanoseconds */ + u32 pi2_prob; /* Base PI probability */ + u32 pi2_alpha; /* Gain factor for the integral rate response */ + u32 pi2_beta; /* Gain factor for the proportional response */ + struct hrtimer pi2_timer; /* prob update timer */ + + /* Step AQM (L-queue only) parameters */ + u32 step_thresh; /* Step threshold */ + bool step_in_packets; /* Step thresh in packets (1) or time (0) */ + + /* C-queue starvation protection */ + s32 c_protection_credit; /* Credit (sign indicates which queue) */ + s32 c_protection_init; /* Reset value of the credit */ + u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */ + u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */ + + /* General dualQ parameters */ + u32 memory_limit; /* Memory limit of both queues */ + u8 coupling_factor;/* Coupling factor (k) between both queues */ + u8 ecn_mask; /* Mask to match packets into L-queue */ + u32 min_qlen_step; /* Minimum queue length to apply step thresh */ + bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */ + bool drop_overload; /* Drop (1) on overload, or overflow (0) */ + bool split_gso; /* Split aggregated skb (1) or leave as is (0) */ + + /* Statistics */ + u64 c_head_ts; /* Enqueue timestamp of the C-queue head */ + u64 l_head_ts; /* Enqueue timestamp of the L-queue head */ + u64 last_qdelay; /* Q delay val at the last probability update */ + u32 packets_in_c; /* Enqueue packet counter of the C-queue */ + u32 packets_in_l; /* Enqueue packet 
counter of the L-queue */ + u32 maxq; /* Maximum queue size of the C-queue */ + u32 ecn_mark; /* ECN mark pkt counter due to PI probability */ + u32 step_marks; /* ECN mark pkt counter due to step AQM */ + u32 memory_used; /* Memory used of both queues */ + u32 max_memory_used;/* Maximum used memory */ + + /* Deferred drop statistics */ + u32 deferred_drops_cnt; /* Packets dropped */ + u32 deferred_drops_len; /* Bytes dropped */ +}; + +struct dualpi2_skb_cb { + u64 ts; /* Timestamp at enqueue */ + u8 apply_step:1, /* Can we apply the step threshold */ + classified:2, /* Packet classification results */ + ect:2; /* Packet ECT codepoint */ +}; + +enum dualpi2_classification_results { + DUALPI2_C_CLASSIC = 0, /* C-queue */ + DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */ + DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */ + __DUALPI2_C_MAX /* Keep last*/ +}; + +static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb)); + return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference) +{ + return reference - dualpi2_skb_cb(skb)->ts; +} + +static u64 head_enqueue_time(struct Qdisc *q) +{ + struct sk_buff *skb = qdisc_peek_head(q); + + return skb ? dualpi2_skb_cb(skb)->ts : 0; +} + +static u32 dualpi2_scale_alpha_beta(u32 param) +{ + u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); + + do_div(tmp, NSEC_PER_SEC); + return tmp; +} + +static u32 dualpi2_unscale_alpha_beta(u32 param) +{ + u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING); + + do_div(tmp, MAX_PROB); + return tmp; +} + +static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) +{ + return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); +} + +static bool skb_is_l4s(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S; +} + +static bool skb_in_l_queue(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC; +} + +static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q) +{ + return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step; +} + +static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb) +{ + if (INET_ECN_set_ce(skb)) { + q->ecn_mark++; + return true; + } + return false; +} + +static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) +{ + q->c_protection_credit = q->c_protection_init; +} + +/* This computes the initial credit value and WRR weight for the L queue (wl) + * from the weight of the C queue (wc). + * If wl > wc, the scheduler will start with the L queue when reset. + */ +static void dualpi2_calculate_c_protection(struct Qdisc *sch, + struct dualpi2_sched_data *q, u32 wc) +{ + q->c_protection_wc = wc; + q->c_protection_wl = MAX_WC - wc; + q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) * + ((int)q->c_protection_wc - (int)q->c_protection_wl); + dualpi2_reset_c_protection(q); +} + +static bool dualpi2_roll(u32 prob) +{ + return get_random_u32() <= prob; +} + +/* Packets in the C-queue are subject to a marking probability pC, which is the + * square of the internal PI probability (i.e., have an overall lower mark/drop + * probability). If the qdisc is overloaded, ignore ECT values and only drop. + * + * Note that this marking scheme is also applied to L4S packets during overload. 
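A worked example of the coupling described here and in the L-queue rule just below (assuming the RFC 9332-recommended coupling factor k = 2): for an internal PI probability p = 5%, the L-queue marks with pL = k * p = 10%, while the C-queue marks/drops with pC = p^2 = 0.25%, which is exactly what the double dualpi2_roll(prob) test in dualpi2_classic_marking() implements.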
+ * Return true if packet dropping is required in C queue + */ +static bool dualpi2_classic_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, u32 prob, + bool overload) +{ + if (dualpi2_roll(prob) && dualpi2_roll(prob)) { + if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; + dualpi2_mark(q, skb); + } + return false; +} + +/* Packets in the L-queue are subject to a marking probability pL given by the + * internal PI probability scaled by the coupling factor. + * + * On overload (i.e., @local_l_prob is >= 100%): + * - if the qdisc is configured to trade losses to preserve latency (i.e., + * @q->drop_overload), apply classic drops first before marking. + * - otherwise, preserve the "no loss" property of ECN at the cost of queueing + * delay, eventually resulting in taildrop behavior once sch->limit is + * reached. + * Return true if packet dropping is required in L queue + */ +static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, + u64 local_l_prob, u32 prob, + bool overload) +{ + if (overload) { + /* Apply classic drop */ + if (!q->drop_overload || + !(dualpi2_roll(prob) && dualpi2_roll(prob))) + goto mark; + return true; + } + + /* We can safely cut the upper 32b as overload==false */ + if (dualpi2_roll(local_l_prob)) { + /* Non-ECT packets could have classified as L4S by filters. */ + if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; +mark: + dualpi2_mark(q, skb); + } + return false; +} + +/* Decide whether a given packet must be dropped (or marked if ECT), according + * to the PI2 probability. + * + * Never mark/drop if we have a standing queue of less than 2 MTUs. + */ +static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + u64 local_l_prob; + bool overload; + u32 prob; + + if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch))) + return false; + + prob = READ_ONCE(q->pi2_prob); + local_l_prob = (u64)prob * q->coupling_factor; + overload = local_l_prob > MAX_PROB; + + switch (dualpi2_skb_cb(skb)->classified) { + case DUALPI2_C_CLASSIC: + return dualpi2_classic_marking(q, skb, prob, overload); + case DUALPI2_C_L4S: + return dualpi2_scalable_marking(q, skb, local_l_prob, prob, + overload); + default: /* DUALPI2_C_LLLL */ + return false; + } +} + +static void dualpi2_read_ect(struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + int wlen = skb_network_offset(skb); + + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK; + break; + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK; + break; + default: + goto not_ecn; + } + return; + +not_ecn: + /* Non pullable/writable packets can only be dropped hence are + * classified as not ECT. 
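A short worked note on the classification that follows (the default mask value is set outside this excerpt, so 0b01 is an assumption matching the "IP.ECN==0b*1" identifier in the file header): with ecn_mask = 0b01, ECT(1) = 0b01 and CE = 0b11 satisfy cb->ect & q->ecn_mask and are steered to the L-queue, while Not-ECT = 0b00 and ECT(0) = 0b10 fall through to the priority/filter checks and otherwise land in the C-queue.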
+ */ + cb->ect = INET_ECN_NOT_ECT; +} + +static int dualpi2_skb_classify(struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + struct tcf_result res; + struct tcf_proto *fl; + int result; + + dualpi2_read_ect(skb); + if (cb->ect & q->ecn_mask) { + cb->classified = DUALPI2_C_L4S; + return NET_XMIT_SUCCESS; + } + + if (TC_H_MAJ(skb->priority) == q->sch->handle && + TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) { + cb->classified = TC_H_MIN(skb->priority); + return NET_XMIT_SUCCESS; + } + + fl = rcu_dereference_bh(q->tcf_filters); + if (!fl) { + cb->classified = DUALPI2_C_CLASSIC; + return NET_XMIT_SUCCESS; + } + + result = tcf_classify(skb, NULL, fl, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } +#endif + cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ? + TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC; + } + return NET_XMIT_SUCCESS; +} + +static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct dualpi2_skb_cb *cb; + + if (unlikely(qdisc_qlen(sch) >= sch->limit) || + unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) { + qdisc_qstats_overlimit(sch); + if (skb_in_l_queue(skb)) + qdisc_qstats_overlimit(q->l_queue); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); + } + + if (q->drop_early && must_drop(sch, q, skb)) { + qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_CONGESTED); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } + + cb = dualpi2_skb_cb(skb); + cb->ts = ktime_get_ns(); + q->memory_used += skb->truesize; + if (q->memory_used > q->max_memory_used) + q->max_memory_used = q->memory_used; + + if (qdisc_qlen(sch) > q->maxq) + q->maxq = qdisc_qlen(sch); + + if (skb_in_l_queue(skb)) { + /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */ + dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q); + + /* Keep the overall qdisc stats consistent */ + ++sch->q.qlen; + qdisc_qstats_backlog_inc(sch, skb); + ++q->packets_in_l; + if (!q->l_head_ts) + q->l_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, q->l_queue); + } + ++q->packets_in_c; + if (!q->c_head_ts) + q->c_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, sch); +} + +/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue + * each of those individually. This yields the following benefits, at the + * expense of CPU usage: + * - Finer-grained AQM actions as the sub-packets of a burst no longer share the + * same fate (e.g., the random mark/drop probability is applied individually) + * - Improved precision of the starvation protection/WRR scheduler at dequeue, + * as the size of the dequeued packets will be smaller. 
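+ *
+ * (For instance, a 64KB TCP GSO aggregate is enqueued as roughly 44
+ * individual ~1500B segments, each carrying its own enqueue timestamp and
+ * getting its own mark/drop decision.)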
+ */ +static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + err = dualpi2_skb_classify(q, skb); + if (err != NET_XMIT_SUCCESS) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + if (q->split_gso && skb_is_gso(skb)) { + netdev_features_t features; + struct sk_buff *nskb, *next; + int cnt, byte_len, orig_len; + int err; + + features = netif_skb_features(skb); + nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(nskb)) + return qdisc_drop(skb, sch, to_free); + + cnt = 1; + byte_len = 0; + orig_len = qdisc_pkt_len(skb); + skb_list_walk_safe(nskb, nskb, next) { + skb_mark_not_on_list(nskb); + + /* Iterate through GSO fragments of an skb: + * (1) Set pkt_len from the single GSO fragments + * (2) Copy classified and ect values of an skb + * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb + */ + qdisc_skb_cb(nskb)->pkt_len = nskb->len; + dualpi2_skb_cb(nskb)->classified = + dualpi2_skb_cb(skb)->classified; + dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; + err = dualpi2_enqueue_skb(nskb, sch, to_free); + + if (err == NET_XMIT_SUCCESS) { + /* Compute the backlog adjustment that needs + * to be propagated in the qdisc tree to reflect + * all new skbs successfully enqueued. + */ + ++cnt; + byte_len += nskb->len; + } + } + if (cnt > 1) { + /* The caller will add the original skb stats to its + * backlog, compensate this if any nskb is enqueued. + */ + --cnt; + byte_len -= orig_len; + } + qdisc_tree_reduce_backlog(sch, -cnt, -byte_len); + consume_skb(skb); + return err; + } + return dualpi2_enqueue_skb(skb, sch, to_free); +} + +/* Select the queue from which the next packet can be dequeued, ensuring that + * neither queue can starve the other with a WRR scheduler. + * + * The sign of the WRR credit determines the next queue, while the size of + * the dequeued packet determines the magnitude of the WRR credit change. If + * either queue is empty, the WRR credit is kept unchanged. + * + * As the dequeued packet can be dropped later, the caller has to perform the + * qdisc_bstats_update() calls. 
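+ *
+ * (With the default weights wc=10/wl=90 set in dualpi2_reset_default(),
+ * dequeuing an L-queue packet adds 10 * pkt_len to the credit while
+ * dequeuing a C-queue packet subtracts 90 * pkt_len, so the C-queue is
+ * guaranteed roughly 10% of the transmitted bytes when both queues are
+ * backlogged.)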
+ */ +static struct sk_buff *dequeue_packet(struct Qdisc *sch, + struct dualpi2_sched_data *q, + int *credit_change, + u64 now) +{ + struct sk_buff *skb = NULL; + int c_len; + + *credit_change = 0; + c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue); + if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) { + skb = __qdisc_dequeue_head(&q->l_queue->q); + WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue)); + if (c_len) + *credit_change = q->c_protection_wc; + qdisc_qstats_backlog_dec(q->l_queue, skb); + + /* Keep the global queue size consistent */ + --sch->q.qlen; + q->memory_used -= skb->truesize; + } else if (c_len) { + skb = __qdisc_dequeue_head(&sch->q); + WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch)); + if (qdisc_qlen(q->l_queue)) + *credit_change = ~((s32)q->c_protection_wl) + 1; + q->memory_used -= skb->truesize; + } else { + dualpi2_reset_c_protection(q); + return NULL; + } + *credit_change *= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); + return skb; +} + +static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb, + u64 now) +{ + u64 qdelay = 0; + + if (q->step_in_packets) + qdelay = qdisc_qlen(q->l_queue); + else + qdelay = dualpi2_sojourn_time(skb, now); + + if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) { + if (!dualpi2_skb_cb(skb)->ect) { + /* Drop this non-ECT packet */ + return 1; + } + + if (dualpi2_mark(q, skb)) + ++q->step_marks; + } + qdisc_bstats_update(q->l_queue, skb); + return 0; +} + +static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, + struct Qdisc *sch, enum skb_drop_reason reason) +{ + ++q->deferred_drops_cnt; + q->deferred_drops_len += qdisc_pkt_len(skb); + kfree_skb_reason(skb, reason); + qdisc_qstats_drop(sch); +} + +static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int credit_change; + u64 now; + + now = ktime_get_ns(); + + while ((skb = dequeue_packet(sch, q, &credit_change, now))) { + if (!q->drop_early && must_drop(sch, q, skb)) { + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_QDISC_CONGESTED); + continue; + } + + if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) { + qdisc_qstats_drop(q->l_queue); + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_DUALPI2_STEP_DROP); + continue; + } + + q->c_protection_credit += credit_change; + qdisc_bstats_update(sch, skb); + break; + } + + if (q->deferred_drops_cnt) { + qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt, + q->deferred_drops_len); + q->deferred_drops_cnt = 0; + q->deferred_drops_len = 0; + } + return skb; +} + +static s64 __scale_delta(u64 diff) +{ + do_div(diff, 1 << ALPHA_BETA_GRANULARITY); + return diff; +} + +static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, + u64 *qdelay_l) +{ + u64 now, qc, ql; + + now = ktime_get_ns(); + qc = READ_ONCE(q->c_head_ts); + ql = READ_ONCE(q->l_head_ts); + + *qdelay_c = qc ? now - qc : 0; + *qdelay_l = ql ? now - ql : 0; +} + +static u32 calculate_probability(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + u32 new_prob; + u64 qdelay_c; + u64 qdelay_l; + u64 qdelay; + s64 delta; + + get_queue_delays(q, &qdelay_c, &qdelay_l); + qdelay = max(qdelay_l, qdelay_c); + + /* Alpha and beta take at most 32b, i.e, the delay difference would + * overflow for queuing delay differences > ~4.2sec. 
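+ *
+ * (The computation below follows the PI2 update law of RFC 9332: the
+ * probability is nudged by alpha * (qdelay - target) +
+ * beta * (qdelay - previous qdelay), where qdelay is the larger of the two
+ * per-queue delays, and the result is clamped to [0, MAX_PROB].)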
+ */ + delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha; + delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta; + q->last_qdelay = qdelay; + + /* Bound new_prob between 0 and MAX_PROB */ + if (delta > 0) { + new_prob = __scale_delta(delta) + q->pi2_prob; + if (new_prob < q->pi2_prob) + new_prob = MAX_PROB; + } else { + new_prob = q->pi2_prob - __scale_delta(~delta + 1); + if (new_prob > q->pi2_prob) + new_prob = 0; + } + + /* If we do not drop on overload, ensure we cap the L4S probability to + * 100% to keep window fairness when overflowing. + */ + if (!q->drop_overload) + return min_t(u32, new_prob, MAX_PROB / q->coupling_factor); + return new_prob; +} + +static u32 get_memory_limit(struct Qdisc *sch, u32 limit) +{ + /* Apply rule of thumb, i.e., doubling the packet length, + * to further include per packet overhead in memory_limit. + */ + u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch))); + + if (upper_32_bits(memlim)) + return U32_MAX; + else + return lower_32_bits(memlim); +} + +static u32 convert_us_to_nsec(u32 us) +{ + u64 ns = mul_u32_u32(us, NSEC_PER_USEC); + + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + +static u32 convert_ns_to_usec(u64 ns) +{ + do_div(ns, NSEC_PER_USEC); + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + +static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) +{ + struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + + rcu_read_lock(); + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + WRITE_ONCE(q->pi2_prob, calculate_probability(sch)); + hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); + + spin_unlock(root_lock); + rcu_read_unlock(); + return HRTIMER_RESTART; +} + +static struct netlink_range_validation dualpi2_alpha_beta_range = { + .min = 1, + .max = ALPHA_BETA_MAX, +}; + +static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = { + [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_TARGET] = { .type = NLA_U32 }, + [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_ALPHA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_BETA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 }, + [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 }, + [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 }, + [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1), + [TCA_DUALPI2_DROP_OVERLOAD] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX), + [TCA_DUALPI2_DROP_EARLY] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX), + [TCA_DUALPI2_C_PROTECTION] = + NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC), + [TCA_DUALPI2_ECN_MASK] = + NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT, + TCA_DUALPI2_ECN_MASK_MAX), + [TCA_DUALPI2_SPLIT_GSO] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX), +}; + +static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_DUALPI2_MAX + 1]; + struct dualpi2_sched_data *q; + int old_backlog; + int old_qlen; + int err; + + if (!opt || !nla_len(opt)) { + NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required"); + return -EINVAL; + } + err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy, + extack); + if (err < 0) + return err; + 
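+ /* A step threshold may be given either in packets or in microseconds,
+  * but not both at the same time.
+  */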
if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) { + NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes"); + return -EINVAL; + } + + q = qdisc_priv(sch); + sch_tree_lock(sch); + + if (tb[TCA_DUALPI2_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]); + + WRITE_ONCE(sch->limit, limit); + WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit)); + } + + if (tb[TCA_DUALPI2_MEMORY_LIMIT]) + WRITE_ONCE(q->memory_limit, + nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT])); + + if (tb[TCA_DUALPI2_TARGET]) { + u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]); + + WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC); + } + + if (tb[TCA_DUALPI2_TUPDATE]) { + u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]); + + WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate)); + } + + if (tb[TCA_DUALPI2_ALPHA]) { + u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]); + + WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha)); + } + + if (tb[TCA_DUALPI2_BETA]) { + u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]); + + WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta)); + } + + if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]); + + WRITE_ONCE(q->step_in_packets, true); + WRITE_ONCE(q->step_thresh, step_th); + } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]); + + WRITE_ONCE(q->step_in_packets, false); + WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th)); + } + + if (tb[TCA_DUALPI2_MIN_QLEN_STEP]) + WRITE_ONCE(q->min_qlen_step, + nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP])); + + if (tb[TCA_DUALPI2_COUPLING]) { + u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]); + + WRITE_ONCE(q->coupling_factor, coupling); + } + + if (tb[TCA_DUALPI2_DROP_OVERLOAD]) { + u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]); + + WRITE_ONCE(q->drop_overload, (bool)drop_overload); + } + + if (tb[TCA_DUALPI2_DROP_EARLY]) { + u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]); + + WRITE_ONCE(q->drop_early, (bool)drop_early); + } + + if (tb[TCA_DUALPI2_C_PROTECTION]) { + u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]); + + dualpi2_calculate_c_protection(sch, q, wc); + } + + if (tb[TCA_DUALPI2_ECN_MASK]) { + u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]); + + WRITE_ONCE(q->ecn_mask, ecn_mask); + } + + if (tb[TCA_DUALPI2_SPLIT_GSO]) { + u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]); + + WRITE_ONCE(q->split_gso, (bool)split_gso); + } + + old_qlen = qdisc_qlen(sch); + old_backlog = sch->qstats.backlog; + while (qdisc_qlen(sch) > sch->limit || + q->memory_used > q->memory_limit) { + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); + + q->memory_used -= skb->truesize; + qdisc_qstats_backlog_dec(sch, skb); + rtnl_qdisc_drop(skb, sch); + } + qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch), + old_backlog - sch->qstats.backlog); + + sch_tree_unlock(sch); + return 0; +} + +/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. 
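+ *
+ * (alpha and beta are configured in units of 1/256 Hz, so the defaults set
+ * below, 41 and 819, correspond to roughly 0.16 Hz and 3.2 Hz.)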
*/ +static void dualpi2_reset_default(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->sch->limit = 10000; /* Max 125ms at 1Gbps */ + q->memory_limit = get_memory_limit(sch, q->sch->limit); + + q->pi2_target = 15 * NSEC_PER_MSEC; + q->pi2_tupdate = 16 * NSEC_PER_MSEC; + q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */ + q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */ + + q->step_thresh = 1 * NSEC_PER_MSEC; + q->step_in_packets = false; + + dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */ + + q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */ + q->min_qlen_step = 0; /* Always apply step mark in L-queue */ + q->coupling_factor = 2; /* window fairness for equal RTTs */ + q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */ + q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */ + q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */ +} + +static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, 1), extack); + if (!q->l_queue) + return -ENOMEM; + + err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack); + if (err) + return err; + + q->sch = sch; + dualpi2_reset_default(sch); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + + if (opt && nla_len(opt)) { + err = dualpi2_change(sch, opt, extack); + + if (err) + return err; + } + + hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), + HRTIMER_MODE_ABS_PINNED); + return 0; +} + +static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + bool step_in_pkts; + u32 step_th; + + step_in_pkts = READ_ONCE(q->step_in_packets); + step_th = READ_ONCE(q->step_thresh); + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + if (!step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + 
convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US, + convert_ns_to_usec(step_th)) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct tc_dualpi2_xstats st = { + .prob = READ_ONCE(q->pi2_prob), + .packets_in_c = q->packets_in_c, + .packets_in_l = q->packets_in_l, + .maxq = q->maxq, + .ecn_mark = q->ecn_mark, + .credit = q->c_protection_credit, + .step_marks = q->step_marks, + .memory_used = q->memory_used, + .max_memory_used = q->max_memory_used, + .memory_limit = q->memory_limit, + }; + u64 qc, ql; + + get_queue_delays(q, &qc, &ql); + st.delay_l = convert_ns_to_usec(ql); + st.delay_c = convert_ns_to_usec(qc); + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +/* Reset both L-queue and C-queue, internal packet counters, PI probability, + * C-queue protection credit, and timestamps, while preserving current + * configuration of DUALPI2. 
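+ *
+ * (The hrtimer that periodically recomputes the PI2 probability is left
+ * running; it is only cancelled in dualpi2_destroy().)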
+ */ +static void dualpi2_reset(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + qdisc_reset_queue(q->l_queue); + q->c_head_ts = 0; + q->l_head_ts = 0; + q->pi2_prob = 0; + q->packets_in_c = 0; + q->packets_in_l = 0; + q->maxq = 0; + q->ecn_mark = 0; + q->step_marks = 0; + q->memory_used = 0; + q->max_memory_used = 0; + dualpi2_reset_c_protection(q); +} + +static void dualpi2_destroy(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->pi2_tupdate = 0; + hrtimer_cancel(&q->pi2_timer); + if (q->l_queue) + qdisc_put(q->l_queue); + tcf_block_put(q->tcf_block); +} + +static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void dualpi2_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->tcf_block; +} + +static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + unsigned int i; + + if (arg->stop) + return; + + /* We statically define only 2 queues */ + for (i = 0; i < 2; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +/* Minimal class support to handle tc filters */ +static const struct Qdisc_class_ops dualpi2_class_ops = { + .leaf = dualpi2_leaf, + .find = dualpi2_find, + .tcf_block = dualpi2_tcf_block, + .bind_tcf = dualpi2_bind, + .unbind_tcf = dualpi2_unbind, + .walk = dualpi2_walk, +}; + +static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { + .id = "dualpi2", + .cl_ops = &dualpi2_class_ops, + .priv_size = sizeof(struct dualpi2_sched_data), + .enqueue = dualpi2_qdisc_enqueue, + .dequeue = dualpi2_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = dualpi2_init, + .destroy = dualpi2_destroy, + .reset = dualpi2_reset, + .change = dualpi2_change, + .dump = dualpi2_dump, + .dump_stats = dualpi2_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init dualpi2_module_init(void) +{ + return register_qdisc(&dualpi2_qdisc_ops); +} + +static void __exit dualpi2_module_exit(void) +{ + unregister_qdisc(&dualpi2_qdisc_ops); +} + +module_init(dualpi2_module_init); +module_exit(dualpi2_module_exit); + +MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler"); +MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>"); +MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>"); +MODULE_AUTHOR("Olga Albisser <olga@albisser.org>"); +MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>"); +MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("1.0"); diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 2c069f0181c6..037f764822b9 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -661,7 +661,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) list_del_init(&q->classes[i].alist); - qdisc_tree_flush_backlog(q->classes[i].qdisc); + qdisc_purge_queue(q->classes[i].qdisc); } 
WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index df7fac95ab15..b0e34daf1f75 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -384,7 +384,7 @@ flow_error: static void fq_pie_timer(struct timer_list *t) { - struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); + struct fq_pie_sched_data *q = timer_container_of(q, t, adapt_timer); unsigned long next, tupdate; struct Qdisc *sch = q->sch; spinlock_t *root_lock; /* to lock qdisc for probability calculations */ diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index ce63414185fd..d1d87dce7f3f 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -16,14 +16,18 @@ struct sch_frag_data { unsigned int l2_len; u8 l2_data[VLAN_ETH_HLEN]; int (*xmit)(struct sk_buff *skb); + local_lock_t bh_lock; }; -static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); +static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); + lockdep_assert_held(&data->bh_lock); if (skb_cow_head(skb, data->l2_len) < 0) { kfree_skb(skb); return -ENOMEM; @@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, struct rtable sch_frag_rt = { 0 }; unsigned long orig_dst; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); @@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, IPCB(skb)->frag_max_size = mru; ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { unsigned long orig_dst; struct rt6_info sch_frag_rt; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, @@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else { net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n", diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 514b1b6ac681..1e008a228ebd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -24,6 +24,7 @@ #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> +#include <linux/bpf.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -495,7 +496,7 @@ EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { - struct net_device *dev = from_timer(dev, t, watchdog_timer); + struct net_device *dev = timer_container_of(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); @@ -739,6 +740,8 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, err = skb_array_produce(q, skb); if (unlikely(err)) { + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); + if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else @@ -1001,14 +1004,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, { struct Qdisc 
*sch; - if (!try_module_get(ops->owner)) { + if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; @@ -1078,7 +1081,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); - module_put(ops->owner); + bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 7986145a527c..d8fd35da32a7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -175,6 +175,11 @@ struct hfsc_sched { #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ +static bool cl_in_el_or_vttree(struct hfsc_class *cl) +{ + return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) || + ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node)); +} /* * eligible tree holds backlogged classes being sorted by their eligible times. @@ -830,22 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) } } -static unsigned int -qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - unsigned int len; - - skb = sch->ops->peek(sch); - if (unlikely(skb == NULL)) { - qdisc_warn_nonwc("qdisc_peek_len", sch); - return 0; - } - len = qdisc_pkt_len(skb); - - return len; -} - static void hfsc_adjust_levels(struct hfsc_class *cl) { @@ -1040,6 +1029,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + RB_CLEAR_NODE(&cl->el_node); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1572,7 +1563,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) sch->qstats.backlog += len; sch->q.qlen++; - if (first && !cl->cl_nactive) { + if (first && !cl_in_el_or_vttree(cl)) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 14bf71f57057..c968ea763774 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -821,7 +821,9 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) u32 *pid; } stk[TC_HTB_MAXDEPTH], *sp = stk; - BUG_ON(!hprio->row.rb_node); + if (unlikely(!hprio->row.rb_node)) + return NULL; + sp->root = hprio->row.rb_node; sp->pptr = &hprio->ptr; sp->pid = &hprio->last_ptr_id; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index fdd79d3ccd8c..eafc316ae319 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -973,6 +973,41 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } +static const struct Qdisc_class_ops netem_class_ops; + +static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, + struct netlink_ext_ack *extack) +{ + struct Qdisc *root, *q; + unsigned int i; + + root = qdisc_root_sleeping(sch); + + if (sch != root && root->ops->cl_ops == &netem_class_ops) { + if (duplicates || + ((struct netem_sched_data *)qdisc_priv(root))->duplicate) + goto err; + } + + if (!qdisc_dev(root)) + return 0; + + hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { + if (sch != q && q->ops->cl_ops == &netem_class_ops) { + if (duplicates || + ((struct netem_sched_data *)qdisc_priv(q))->duplicate) + goto err; + } + } + + return 0; + +err: + NL_SET_ERR_MSG(extack, + "netem: cannot mix duplicating netems with 
other netems in tree"); + return -EINVAL; +} + /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) @@ -1031,6 +1066,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; + + ret = check_netem_in_tree(sch, qopt->duplicate, extack); + if (ret) + goto unlock; + q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index ff49a6c97033..ad46ee3ed5a9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -424,7 +424,7 @@ EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { - struct pie_sched_data *q = from_timer(q, t, adapt_timer); + struct pie_sched_data *q = timer_container_of(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index cc30f7a32f1a..9e2b9a490db2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -211,7 +211,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) - qdisc_tree_flush_backlog(q->queues[i]); + qdisc_purge_queue(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index bf1282cb22eb..2255355e51d3 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -412,7 +412,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; struct qfq_aggregate *new_agg = NULL; - u32 weight, lmax, inv_w; + u32 weight, lmax, inv_w, old_weight, old_lmax; int err; int delta_w; @@ -443,12 +443,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; - if (cl != NULL && - lmax == cl->agg->lmax && - weight == cl->agg->class_weight) - return 0; /* nothing to change */ + if (cl != NULL) { + sch_tree_lock(sch); + old_weight = cl->agg->class_weight; + old_lmax = cl->agg->lmax; + sch_tree_unlock(sch); + if (lmax == old_lmax && weight == old_weight) + return 0; /* nothing to change */ + } - delta_w = weight - (cl ? cl->agg->class_weight : 0); + delta_w = weight - (cl ? 
old_weight : 0); if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, @@ -532,9 +536,6 @@ destroy_class: static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { - struct qfq_sched *q = qdisc_priv(sch); - - qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); @@ -555,6 +556,7 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); + qfq_rm_from_agg(q, cl); sch_tree_unlock(sch); @@ -625,6 +627,7 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, { struct qfq_class *cl = (struct qfq_class *)arg; struct nlattr *nest; + u32 class_weight, lmax; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; @@ -633,8 +636,13 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || - nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) + + sch_tree_lock(sch); + class_weight = cl->agg->class_weight; + lmax = cl->agg->lmax; + sch_tree_unlock(sch); + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, lmax)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -651,8 +659,10 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, memset(&xstats, 0, sizeof(xstats)); + sch_tree_lock(sch); xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; + sch_tree_unlock(sch); if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || @@ -989,7 +999,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ list_del_init(&cl->alist); - else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + else if (cl->deficit < qdisc_peek_len(cl->qdisc)) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } @@ -1491,6 +1501,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { + qfq_rm_from_agg(q, cl); qfq_destroy_class(sch, cl); } } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 1ba3e0bba54f..479c42d11083 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -285,7 +285,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, q->userbits = userbits; q->limit = ctl->limit; if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old_child = q->qdisc; q->qdisc = child; } @@ -321,7 +321,7 @@ unlock_out: static inline void red_adaptative_timer(struct timer_list *t) { - struct red_sched_data *q = from_timer(q, t, adapt_timer); + struct red_sched_data *q = timer_container_of(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b912ad99aa15..96eb2f122973 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -310,7 +310,10 @@ drop: /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
*/ x = q->tail->next; slot = &q->slots[x]; - q->tail->next = slot->next; + if (slot->next == x) + q->tail = NULL; /* no more active slots */ + else + q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } @@ -597,7 +600,7 @@ drop: static void sfq_perturbation(struct timer_list *t) { - struct sfq_sched_data *q = from_timer(q, t, perturb_timer); + struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; @@ -653,6 +656,14 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); return -EINVAL; } + + if (ctl->perturb_period < 0 || + ctl->perturb_period > INT_MAX / HZ) { + NL_SET_ERR_MSG_MOD(extack, "invalid perturb period"); + return -EINVAL; + } + perturb_period = ctl->perturb_period * HZ; + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; @@ -669,14 +680,12 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, headdrop = q->headdrop; maxdepth = q->maxdepth; maxflows = q->maxflows; - perturb_period = q->perturb_period; quantum = q->quantum; flags = q->flags; /* update and validate configuration */ if (ctl->quantum) quantum = ctl->quantum; - perturb_period = ctl->perturb_period * HZ; if (ctl->flows) maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 14021b812329..e759e43ad27e 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -998,7 +998,7 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, - TC_QOPT_MAX_QUEUE), + TC_QOPT_MAX_QUEUE - 1), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, @@ -1328,13 +1328,15 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, stab = rtnl_dereference(q->root->stab); - oper = rtnl_dereference(q->oper_sched); + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); - admin = rtnl_dereference(q->admin_sched); + admin = rcu_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); + rcu_read_unlock(); break; } @@ -1696,19 +1698,15 @@ static int taprio_parse_tc_entry(struct Qdisc *sch, if (err < 0) return err; - if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { + if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_TAPRIO_TC_ENTRY_INDEX)) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); - if (tc >= TC_QOPT_MAX_QUEUE) { - NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); - return -ERANGE; - } - if (*seen_tcs & BIT(tc)) { - NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_TC_ENTRY_INDEX], + "Duplicate tc entry"); return -EINVAL; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index dc26b22d53c7..4c977f049670 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -452,7 +452,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; } |
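
As a quick sanity check of the DualPI2 probability handling added in sch_dualpi2.c above, the following stand-alone user-space sketch (hypothetical, not part of this patch; roll() and xorshift32() are names invented for the example) reproduces the two-roll "squared" decision used for the C-queue and the coupled probability used for the L-queue, assuming an internal PI probability of 10% and the default coupling factor of 2:

/* Hypothetical user-space sketch, not part of the kernel patch. */
#include <stdint.h>
#include <stdio.h>

#define MAX_PROB 0xffffffffU	/* 100% in the qdisc's fixed-point scale */

static uint32_t rng_state = 0x12345678;

/* xorshift32: stand-in for the kernel's get_random_u32() */
static uint32_t xorshift32(void)
{
	rng_state ^= rng_state << 13;
	rng_state ^= rng_state >> 17;
	rng_state ^= rng_state << 5;
	return rng_state;
}

/* Mirrors dualpi2_roll(): succeed with probability prob / 2^32 */
static int roll(uint32_t prob)
{
	return xorshift32() <= prob;
}

int main(void)
{
	uint32_t pi2_prob = MAX_PROB / 10;		/* internal PI probability: 10% */
	uint64_t l4s_prob = (uint64_t)pi2_prob * 2;	/* coupling_factor = 2 */
	long classic = 0, l4s = 0, trials = 1000000;

	for (long i = 0; i < trials; i++) {
		/* C-queue: mark/drop with probability p^2 (two independent rolls) */
		if (roll(pi2_prob) && roll(pi2_prob))
			classic++;
		/* L-queue: mark with the coupled probability p * coupling_factor */
		if (roll((uint32_t)l4s_prob))
			l4s++;
	}
	printf("classic ~%.2f%% (expect ~1%%), l4s ~%.2f%% (expect ~20%%)\n",
	       100.0 * classic / trials, 100.0 * l4s / trials);
	return 0;
}

The kernel code applies the same two-roll construction in dualpi2_classic_marking(), while dualpi2_scalable_marking() additionally handles the overload case where the coupled probability exceeds 100%.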