Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig             91
-rw-r--r--  net/sched/Makefile             7
-rw-r--r--  net/sched/act_api.c           57
-rw-r--r--  net/sched/act_connmark.c     107
-rw-r--r--  net/sched/act_ct.c           141
-rw-r--r--  net/sched/act_gate.c          30
-rw-r--r--  net/sched/act_mirred.c        23
-rw-r--r--  net/sched/act_nat.c           72
-rw-r--r--  net/sched/act_pedit.c        300
-rw-r--r--  net/sched/cls_api.c          304
-rw-r--r--  net/sched/cls_flower.c        80
-rw-r--r--  net/sched/cls_matchall.c       6
-rw-r--r--  net/sched/cls_rsvp.c          26
-rw-r--r--  net/sched/cls_rsvp.h         764
-rw-r--r--  net/sched/cls_rsvp6.c         26
-rw-r--r--  net/sched/cls_tcindex.c      742
-rw-r--r--  net/sched/sch_api.c           87
-rw-r--r--  net/sched/sch_atm.c          706
-rw-r--r--  net/sched/sch_cake.c           2
-rw-r--r--  net/sched/sch_cbq.c         1727
-rw-r--r--  net/sched/sch_dsmark.c       518
-rw-r--r--  net/sched/sch_mqprio.c       291
-rw-r--r--  net/sched/sch_mqprio_lib.c   117
-rw-r--r--  net/sched/sch_mqprio_lib.h    18
-rw-r--r--  net/sched/sch_taprio.c       745
25 files changed, 1556 insertions, 5431 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 777d6b50505c..4b95cb1ac435 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -45,23 +45,6 @@ if NET_SCHED
comment "Queueing/Scheduling"
-config NET_SCH_CBQ
- tristate "Class Based Queueing (CBQ)"
- help
- Say Y here if you want to use the Class-Based Queueing (CBQ) packet
- scheduling algorithm. This algorithm classifies the waiting packets
- into a tree-like hierarchy of classes; the leaves of this tree are
- in turn scheduled by separate algorithms.
-
- See the top of <file:net/sched/sch_cbq.c> for more details.
-
- CBQ is a commonly used scheduler, so if you're unsure, you should
- say Y here. Then say Y to all the queueing algorithms below that you
- want to use as leaf disciplines.
-
- To compile this code as a module, choose M here: the
- module will be called sch_cbq.
-
config NET_SCH_HTB
tristate "Hierarchical Token Bucket (HTB)"
help
@@ -85,20 +68,6 @@ config NET_SCH_HFSC
To compile this code as a module, choose M here: the
module will be called sch_hfsc.
-config NET_SCH_ATM
- tristate "ATM Virtual Circuits (ATM)"
- depends on ATM
- help
- Say Y here if you want to use the ATM pseudo-scheduler. This
- provides a framework for invoking classifiers, which in turn
- select classes of this queuing discipline. Each class maps
- the flow(s) it is handling to a given virtual circuit.
-
- See the top of <file:net/sched/sch_atm.c> for more details.
-
- To compile this code as a module, choose M here: the
- module will be called sch_atm.
-
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
help
@@ -195,8 +164,14 @@ config NET_SCH_ETF
To compile this code as a module, choose M here: the
module will be called sch_etf.
+config NET_SCH_MQPRIO_LIB
+ tristate
+ help
+ Common library for manipulating mqprio queue configurations.
+
config NET_SCH_TAPRIO
tristate "Time Aware Priority (taprio) Scheduler"
+ select NET_SCH_MQPRIO_LIB
help
Say Y here if you want to use the Time Aware Priority (taprio) packet
scheduling algorithm.
@@ -217,17 +192,6 @@ config NET_SCH_GRED
To compile this code as a module, choose M here: the
module will be called sch_gred.
-config NET_SCH_DSMARK
- tristate "Differentiated Services marker (DSMARK)"
- help
- Say Y if you want to schedule packets according to the
- Differentiated Services architecture proposed in RFC 2475.
- Technical information on this method, with pointers to associated
- RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
-
- To compile this code as a module, choose M here: the
- module will be called sch_dsmark.
-
config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
help
@@ -253,6 +217,7 @@ config NET_SCH_DRR
config NET_SCH_MQPRIO
tristate "Multi-queue priority scheduler (MQPRIO)"
+ select NET_SCH_MQPRIO_LIB
help
Say Y here if you want to use the Multi-queue Priority scheduler.
This scheduler allows QOS to be offloaded on NICs that have support
@@ -337,7 +302,7 @@ config NET_SCH_FQ
Say Y here if you want to use the FQ packet scheduling algorithm.
FQ does flow separation, and is able to respect pacing requirements
- set by TCP stack into sk->sk_pacing_rate (for localy generated
+ set by TCP stack into sk->sk_pacing_rate (for locally generated
traffic)
To compile this driver as a module, choose M here: the module
@@ -503,17 +468,6 @@ config NET_CLS_BASIC
To compile this code as a module, choose M here: the
module will be called cls_basic.
-config NET_CLS_TCINDEX
- tristate "Traffic-Control Index (TCINDEX)"
- select NET_CLS
- help
- Say Y here if you want to be able to classify packets based on
- traffic control indices. You will want this feature if you want
- to implement Differentiated Services together with DSMARK.
-
- To compile this code as a module, choose M here: the
- module will be called cls_tcindex.
-
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
depends on INET
@@ -559,34 +513,6 @@ config CLS_U32_MARK
help
Say Y here to be able to use netfilter marks as u32 key.
-config NET_CLS_RSVP
- tristate "IPv4 Resource Reservation Protocol (RSVP)"
- select NET_CLS
- help
- The Resource Reservation Protocol (RSVP) permits end systems to
- request a minimum and maximum data flow rate for a connection; this
- is important for real time data such as streaming sound or video.
-
- Say Y here if you want to be able to classify outgoing packets based
- on their RSVP requests.
-
- To compile this code as a module, choose M here: the
- module will be called cls_rsvp.
-
-config NET_CLS_RSVP6
- tristate "IPv6 Resource Reservation Protocol (RSVP6)"
- select NET_CLS
- help
- The Resource Reservation Protocol (RSVP) permits end systems to
- request a minimum and maximum data flow rate for a connection; this
- is important for real time data such as streaming sound or video.
-
- Say Y here if you want to be able to classify outgoing packets based
- on their RSVP requests and you are using the IPv6 protocol.
-
- To compile this code as a module, choose M here: the
- module will be called cls_rsvp6.
-
config NET_CLS_FLOW
tristate "Flow classifier"
select NET_CLS
@@ -977,6 +903,7 @@ config NET_ACT_TUNNEL_KEY
config NET_ACT_CT
tristate "connection tracking tc action"
depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE
+ select NF_CONNTRACK_OVS
select NF_NAT_OVS if NF_NAT
help
Say Y here to allow sending the packets to conntrack module.
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dd14ef413fda..b5fd49641d91 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,25 +33,23 @@ obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
obj-$(CONFIG_NET_ACT_CT) += act_ct.o
obj-$(CONFIG_NET_ACT_GATE) += act_gate.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
-obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
-obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
-obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_MQPRIO_LIB) += sch_mqprio_lib.o
obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
@@ -69,9 +67,6 @@ obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
-obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
-obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
-obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 5b3c0ac495be..fce522886099 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -125,7 +125,7 @@ static void free_tcf(struct tc_action *p)
free_percpu(p->cpu_bstats_hw);
free_percpu(p->cpu_qstats);
- tcf_set_action_cookie(&p->act_cookie, NULL);
+ tcf_set_action_cookie(&p->user_cookie, NULL);
if (chain)
tcf_chain_put_by_act(chain);
@@ -169,11 +169,6 @@ static bool tc_act_skip_sw(u32 flags)
return (flags & TCA_ACT_FLAGS_SKIP_SW) ? true : false;
}
-static bool tc_act_in_hw(struct tc_action *act)
-{
- return !!act->in_hw_count;
-}
-
/* SKIP_HW and SKIP_SW are mutually exclusive flags. */
static bool tc_act_flags_valid(u32 flags)
{
@@ -192,6 +187,7 @@ static int offload_action_init(struct flow_offload_action *fl_action,
fl_action->extack = extack;
fl_action->command = cmd;
fl_action->index = act->tcfa_index;
+ fl_action->cookie = (unsigned long)act;
if (act->ops->offload_act_setup) {
spin_lock_bh(&act->tcfa_lock);
@@ -272,7 +268,7 @@ static int tcf_action_offload_add_ex(struct tc_action *action,
if (err)
goto fl_err;
- err = tc_setup_action(&fl_action->action, actions, extack);
+ err = tc_setup_action(&fl_action->action, actions, 0, extack);
if (err) {
NL_SET_ERR_MSG_MOD(extack,
"Failed to setup tc actions for offload");
@@ -307,9 +303,6 @@ int tcf_action_update_hw_stats(struct tc_action *action)
struct flow_offload_action fl_act = {};
int err;
- if (!tc_act_in_hw(action))
- return -EOPNOTSUPP;
-
err = offload_action_init(&fl_act, action, FLOW_ACT_STATS, NULL);
if (err)
return err;
@@ -438,14 +431,14 @@ EXPORT_SYMBOL(tcf_idr_release);
static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
{
- struct tc_cookie *act_cookie;
+ struct tc_cookie *user_cookie;
u32 cookie_len = 0;
rcu_read_lock();
- act_cookie = rcu_dereference(act->act_cookie);
+ user_cookie = rcu_dereference(act->user_cookie);
- if (act_cookie)
- cookie_len = nla_total_size(act_cookie->len);
+ if (user_cookie)
+ cookie_len = nla_total_size(user_cookie->len);
rcu_read_unlock();
return nla_total_size(0) /* action number nested */
@@ -495,7 +488,7 @@ tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act)
goto nla_put_failure;
rcu_read_lock();
- cookie = rcu_dereference(a->act_cookie);
+ cookie = rcu_dereference(a->user_cookie);
if (cookie) {
if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
rcu_read_unlock();
@@ -539,6 +532,8 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
(unsigned long)p->tcfa_tm.lastuse))
continue;
+ tcf_action_update_hw_stats(p);
+
nest = nla_nest_start_noflag(skb, n_i);
if (!nest) {
index--;
@@ -1367,9 +1362,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
{
bool police = flags & TCA_ACT_FLAGS_POLICE;
struct nla_bitfield32 userflags = { 0, 0 };
+ struct tc_cookie *user_cookie = NULL;
u8 hw_stats = TCA_ACT_HW_STATS_ANY;
struct nlattr *tb[TCA_ACT_MAX + 1];
- struct tc_cookie *cookie = NULL;
struct tc_action *a;
int err;
@@ -1380,8 +1375,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (err < 0)
return ERR_PTR(err);
if (tb[TCA_ACT_COOKIE]) {
- cookie = nla_memdup_cookie(tb);
- if (!cookie) {
+ user_cookie = nla_memdup_cookie(tb);
+ if (!user_cookie) {
NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
err = -ENOMEM;
goto err_out;
@@ -1407,7 +1402,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
*init_res = err;
if (!police && tb[TCA_ACT_COOKIE])
- tcf_set_action_cookie(&a->act_cookie, cookie);
+ tcf_set_action_cookie(&a->user_cookie, user_cookie);
if (!police)
a->hw_stats = hw_stats;
@@ -1415,9 +1410,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
return a;
err_out:
- if (cookie) {
- kfree(cookie->data);
- kfree(cookie);
+ if (user_cookie) {
+ kfree(user_cookie->data);
+ kfree(user_cookie);
}
return ERR_PTR(err);
}
@@ -1539,9 +1534,6 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
if (p == NULL)
goto errout;
- /* update hw stats for this action */
- tcf_action_update_hw_stats(p);
-
/* compat_mode being true specifies a call that is supposed
* to add additional backward compatibility statistic TLVs.
*/
@@ -1582,7 +1574,7 @@ errout:
static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
u32 portid, u32 seq, u16 flags, int event, int bind,
- int ref)
+ int ref, struct netlink_ext_ack *extack)
{
struct tcamsg *t;
struct nlmsghdr *nlh;
@@ -1606,7 +1598,12 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
nla_nest_end(skb, nest);
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -1625,7 +1622,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
if (!skb)
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
- 0, 1) <= 0) {
+ 0, 1, NULL) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
kfree_skb(skb);
return -EINVAL;
@@ -1799,7 +1796,7 @@ tcf_reoffload_del_notify(struct net *net, struct tc_action *action)
if (!skb)
return -ENOBUFS;
- if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1) <= 0) {
+ if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1886,7 +1883,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
- 0, 2) <= 0) {
+ 0, 2, extack) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
kfree_skb(skb);
return -EINVAL;
@@ -1965,7 +1962,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
- RTM_NEWACTION, 0, 0) <= 0) {
+ RTM_NEWACTION, 0, 0, extack) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
kfree_skb(skb);
return -EINVAL;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 7e63ff7e3ed7..8dabfb52ea3d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -36,13 +36,15 @@ TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb,
struct nf_conntrack_tuple tuple;
enum ip_conntrack_info ctinfo;
struct tcf_connmark_info *ca = to_connmark(a);
+ struct tcf_connmark_parms *parms;
struct nf_conntrack_zone zone;
struct nf_conn *c;
int proto;
- spin_lock(&ca->tcf_lock);
tcf_lastuse_update(&ca->tcf_tm);
- bstats_update(&ca->tcf_bstats, skb);
+ tcf_action_update_bstats(&ca->common, skb);
+
+ parms = rcu_dereference_bh(ca->parms);
switch (skb_protocol(skb, true)) {
case htons(ETH_P_IP):
@@ -64,31 +66,29 @@ TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb,
c = nf_ct_get(skb, &ctinfo);
if (c) {
skb->mark = READ_ONCE(c->mark);
- /* using overlimits stats to count how many packets marked */
- ca->tcf_qstats.overlimits++;
- goto out;
+ goto count;
}
- if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- proto, ca->net, &tuple))
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, parms->net,
+ &tuple))
goto out;
- zone.id = ca->zone;
+ zone.id = parms->zone;
zone.dir = NF_CT_DEFAULT_ZONE_DIR;
- thash = nf_conntrack_find_get(ca->net, &zone, &tuple);
+ thash = nf_conntrack_find_get(parms->net, &zone, &tuple);
if (!thash)
goto out;
c = nf_ct_tuplehash_to_ctrack(thash);
- /* using overlimits stats to count how many packets marked */
- ca->tcf_qstats.overlimits++;
skb->mark = READ_ONCE(c->mark);
nf_ct_put(c);
+count:
+ /* using overlimits stats to count how many packets marked */
+ tcf_action_inc_overlimit_qstats(&ca->common);
out:
- spin_unlock(&ca->tcf_lock);
- return ca->tcf_action;
+ return READ_ONCE(ca->tcf_action);
}
static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
@@ -101,6 +101,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id);
+ struct tcf_connmark_parms *nparms, *oparms;
struct nlattr *tb[TCA_CONNMARK_MAX + 1];
bool bind = flags & TCA_ACT_FLAGS_BIND;
struct tcf_chain *goto_ch = NULL;
@@ -120,52 +121,66 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
if (!tb[TCA_CONNMARK_PARMS])
return -EINVAL;
+ nparms = kzalloc(sizeof(*nparms), GFP_KERNEL);
+ if (!nparms)
+ return -ENOMEM;
+
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
index = parm->index;
ret = tcf_idr_check_alloc(tn, &index, a, bind);
if (!ret) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_connmark_ops, bind, false, flags);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_connmark_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
- return ret;
+ err = ret;
+ goto out_free;
}
ci = to_connmark(*a);
- err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
- extack);
- if (err < 0)
- goto release_idr;
- tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- ci->net = net;
- ci->zone = parm->zone;
+
+ nparms->net = net;
+ nparms->zone = parm->zone;
ret = ACT_P_CREATED;
} else if (ret > 0) {
ci = to_connmark(*a);
- if (bind)
- return 0;
- if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
- tcf_idr_release(*a, bind);
- return -EEXIST;
+ if (bind) {
+ err = 0;
+ goto out_free;
}
- err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
- extack);
- if (err < 0)
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+ err = -EEXIST;
goto release_idr;
- /* replacing action and zone */
- spin_lock_bh(&ci->tcf_lock);
- goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- ci->zone = parm->zone;
- spin_unlock_bh(&ci->tcf_lock);
- if (goto_ch)
- tcf_chain_put_by_act(goto_ch);
+ }
+
+ nparms->net = rtnl_dereference(ci->parms)->net;
+ nparms->zone = parm->zone;
+
ret = 0;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock));
+ spin_unlock_bh(&ci->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+ if (oparms)
+ kfree_rcu(oparms, rcu);
+
return ret;
+
release_idr:
tcf_idr_release(*a, bind);
+out_free:
+ kfree(nparms);
return err;
}
@@ -179,11 +194,14 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
.refcnt = refcount_read(&ci->tcf_refcnt) - ref,
.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
};
+ struct tcf_connmark_parms *parms;
struct tcf_t t;
spin_lock_bh(&ci->tcf_lock);
+ parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock));
+
opt.action = ci->tcf_action;
- opt.zone = ci->zone;
+ opt.zone = parms->zone;
if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -201,6 +219,16 @@ nla_put_failure:
return -1;
}
+static void tcf_connmark_cleanup(struct tc_action *a)
+{
+ struct tcf_connmark_info *ci = to_connmark(a);
+ struct tcf_connmark_parms *parms;
+
+ parms = rcu_dereference_protected(ci->parms, 1);
+ if (parms)
+ kfree_rcu(parms, rcu);
+}
+
static struct tc_action_ops act_connmark_ops = {
.kind = "connmark",
.id = TCA_ID_CONNMARK,
@@ -208,6 +236,7 @@ static struct tc_action_ops act_connmark_ops = {
.act = tcf_connmark_act,
.dump = tcf_connmark_dump,
.init = tcf_connmark_init,
+ .cleanup = tcf_connmark_cleanup,
.size = sizeof(struct tcf_connmark_info),
};
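
Aside, not part of the patch: the act_connmark conversion above is one instance of a control-path pattern this series applies to several actions (connmark, nat, pedit). A minimal sketch of that writer side follows; struct, field and function names are illustrative, not the kernel's.

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct example_parms {
        u16 zone;
        struct rcu_head rcu;
};

struct example_act {
        spinlock_t lock;                        /* control path only */
        struct example_parms __rcu *parms;
};

/* Allocate new parameters, publish them under the action lock, and free the
 * old block only after a grace period so lockless readers stay safe.
 */
static int example_act_replace(struct example_act *a, u16 zone)
{
        struct example_parms *nparms, *oparms;

        nparms = kzalloc(sizeof(*nparms), GFP_KERNEL);
        if (!nparms)
                return -ENOMEM;
        nparms->zone = zone;

        spin_lock_bh(&a->lock);
        oparms = rcu_replace_pointer(a->parms, nparms,
                                     lockdep_is_held(&a->lock));
        spin_unlock_bh(&a->lock);

        if (oparms)
                kfree_rcu(oparms, rcu);
        return 0;
}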
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 0ca2bb8ed026..9cc0bc7c71ed 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -170,11 +170,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
enum ip_conntrack_dir dir,
+ enum ip_conntrack_info ctinfo,
struct flow_action *action)
{
struct nf_conn_labels *ct_labels;
struct flow_action_entry *entry;
- enum ip_conntrack_info ctinfo;
u32 *act_ct_labels;
entry = tcf_ct_flow_table_flow_action_get_next(action);
@@ -182,8 +182,6 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
entry->ct_metadata.mark = READ_ONCE(ct->mark);
#endif
- ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
- IP_CT_ESTABLISHED_REPLY;
/* aligns with the CT reference on the SKB nf_ct_set */
entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;
@@ -237,22 +235,28 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net,
}
static int tcf_ct_flow_table_fill_actions(struct net *net,
- const struct flow_offload *flow,
+ struct flow_offload *flow,
enum flow_offload_tuple_dir tdir,
struct nf_flow_rule *flow_rule)
{
struct flow_action *action = &flow_rule->rule->action;
int num_entries = action->num_entries;
struct nf_conn *ct = flow->ct;
+ enum ip_conntrack_info ctinfo;
enum ip_conntrack_dir dir;
int i, err;
switch (tdir) {
case FLOW_OFFLOAD_DIR_ORIGINAL:
dir = IP_CT_DIR_ORIGINAL;
+ ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+ IP_CT_ESTABLISHED : IP_CT_NEW;
+ if (ctinfo == IP_CT_ESTABLISHED)
+ set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
break;
case FLOW_OFFLOAD_DIR_REPLY:
dir = IP_CT_DIR_REPLY;
+ ctinfo = IP_CT_ESTABLISHED_REPLY;
break;
default:
return -EOPNOTSUPP;
@@ -262,7 +266,7 @@ static int tcf_ct_flow_table_fill_actions(struct net *net,
if (err)
goto err_nat;
- tcf_ct_flow_table_add_action_meta(ct, dir, action);
+ tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
return 0;
err_nat:
@@ -365,7 +369,7 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
struct nf_conn *ct,
- bool tcp)
+ bool tcp, bool bidirectional)
{
struct nf_conn_act_ct_ext *act_ct_ext;
struct flow_offload *entry;
@@ -384,6 +388,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
}
+ if (bidirectional)
+ __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);
act_ct_ext = nf_conn_act_ct_ext_find(ct);
if (act_ct_ext) {
@@ -407,26 +413,34 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
- bool tcp = false;
-
- if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) ||
- !test_bit(IPS_ASSURED_BIT, &ct->status))
- return;
+ bool tcp = false, bidirectional = true;
switch (nf_ct_protonum(ct)) {
case IPPROTO_TCP:
- tcp = true;
- if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ if ((ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY) ||
+ !test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
return;
+
+ tcp = true;
break;
case IPPROTO_UDP:
+ if (!nf_ct_is_confirmed(ct))
+ return;
+ if (!test_bit(IPS_ASSURED_BIT, &ct->status))
+ bidirectional = false;
break;
#ifdef CONFIG_NF_CT_PROTO_GRE
case IPPROTO_GRE: {
struct nf_conntrack_tuple *tuple;
- if (ct->status & IPS_NAT_MASK)
+ if ((ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY) ||
+ !test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ ct->status & IPS_NAT_MASK)
return;
+
tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
/* No support for GRE v1 */
if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
@@ -442,7 +456,7 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
ct->status & IPS_SEQ_ADJUST)
return;
- tcf_ct_flow_table_add(ct_ft, ct, tcp);
+ tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
}
static bool
@@ -621,13 +635,30 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
ct = flow->ct;
+ if (dir == FLOW_OFFLOAD_DIR_REPLY &&
+ !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
+ /* Only offload reply direction after connection became
+ * assured.
+ */
+ if (test_bit(IPS_ASSURED_BIT, &ct->status))
+ set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
+ else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
+ /* If flow_table flow has already been updated to the
+ * established state, then don't refresh.
+ */
+ return false;
+ }
+
if (tcph && (unlikely(tcph->fin || tcph->rst))) {
flow_offload_teardown(flow);
return false;
}
- ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
- IP_CT_ESTABLISHED_REPLY;
+ if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
+ ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+ IP_CT_ESTABLISHED : IP_CT_NEW;
+ else
+ ctinfo = IP_CT_ESTABLISHED_REPLY;
flow_offload_refresh(nf_ft, flow);
nf_conntrack_get(&ct->ct_general);
@@ -695,31 +726,6 @@ drop_ct:
return false;
}
-/* Trim the skb to the length specified by the IP/IPv6 header,
- * removing any trailing lower-layer padding. This prepares the skb
- * for higher-layer processing that assumes skb->len excludes padding
- * (such as nf_ip_checksum). The caller needs to pull the skb to the
- * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
- */
-static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
-{
- unsigned int len;
-
- switch (family) {
- case NFPROTO_IPV4:
- len = ntohs(ip_hdr(skb)->tot_len);
- break;
- case NFPROTO_IPV6:
- len = sizeof(struct ipv6hdr)
- + ntohs(ipv6_hdr(skb)->payload_len);
- break;
- default:
- len = skb->len;
- }
-
- return pskb_trim_rcsum(skb, len);
-}
-
static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
{
u8 family = NFPROTO_UNSPEC;
@@ -779,6 +785,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
struct nf_conn *ct;
int err = 0;
bool frag;
+ u8 proto;
u16 mru;
/* Previously seen (loopback)? Ignore. */
@@ -794,50 +801,14 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
return err;
skb_get(skb);
- mru = tc_skb_cb(skb)->mru;
-
- if (family == NFPROTO_IPV4) {
- enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- local_bh_disable();
- err = ip_defrag(net, skb, user);
- local_bh_enable();
- if (err && err != -EINPROGRESS)
- return err;
-
- if (!err) {
- *defrag = true;
- mru = IPCB(skb)->frag_max_size;
- }
- } else { /* NFPROTO_IPV6 */
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
- enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
- err = nf_ct_frag6_gather(net, skb, user);
- if (err && err != -EINPROGRESS)
- goto out_free;
-
- if (!err) {
- *defrag = true;
- mru = IP6CB(skb)->frag_max_size;
- }
-#else
- err = -EOPNOTSUPP;
- goto out_free;
-#endif
- }
+ err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
+ if (err)
+ return err;
- if (err != -EINPROGRESS)
- tc_skb_cb(skb)->mru = mru;
- skb_clear_hash(skb);
- skb->ignore_df = 1;
- return err;
+ *defrag = true;
+ tc_skb_cb(skb)->mru = mru;
-out_free:
- kfree_skb(skb);
- return err;
+ return 0;
}
static void tcf_ct_params_free(struct tcf_ct_params *params)
@@ -980,7 +951,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
if (err)
goto drop;
- err = tcf_ct_skb_network_trim(skb, family);
+ err = nf_ct_skb_network_trim(skb, family);
if (err)
goto drop;
diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
index 9b8def0be41e..c9a811f4c7ee 100644
--- a/net/sched/act_gate.c
+++ b/net/sched/act_gate.c
@@ -119,35 +119,37 @@ TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb,
struct tcf_result *res)
{
struct tcf_gate *gact = to_gate(a);
-
- spin_lock(&gact->tcf_lock);
+ int action = READ_ONCE(gact->tcf_action);
tcf_lastuse_update(&gact->tcf_tm);
- bstats_update(&gact->tcf_bstats, skb);
+ tcf_action_update_bstats(&gact->common, skb);
+ spin_lock(&gact->tcf_lock);
if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) {
spin_unlock(&gact->tcf_lock);
- return gact->tcf_action;
+ return action;
}
- if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN))
+ if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) {
+ spin_unlock(&gact->tcf_lock);
goto drop;
+ }
if (gact->current_max_octets >= 0) {
gact->current_entry_octets += qdisc_pkt_len(skb);
if (gact->current_entry_octets > gact->current_max_octets) {
- gact->tcf_qstats.overlimits++;
- goto drop;
+ spin_unlock(&gact->tcf_lock);
+ goto overlimit;
}
}
-
spin_unlock(&gact->tcf_lock);
- return gact->tcf_action;
-drop:
- gact->tcf_qstats.drops++;
- spin_unlock(&gact->tcf_lock);
+ return action;
+overlimit:
+ tcf_action_inc_overlimit_qstats(&gact->common);
+drop:
+ tcf_action_inc_drop_qstats(&gact->common);
return TC_ACT_SHOT;
}
@@ -357,8 +359,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
return 0;
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_gate_ops, bind, false, flags);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_gate_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 7284bcea7b0b..8037ec9b1d31 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,8 +29,8 @@
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
-#define MIRRED_RECURSION_LIMIT 4
-static DEFINE_PER_CPU(unsigned int, mirred_rec_level);
+#define MIRRED_NEST_LIMIT 4
+static DEFINE_PER_CPU(unsigned int, mirred_nest_level);
static bool tcf_mirred_is_act_redirect(int action)
{
@@ -206,12 +206,19 @@ release_idr:
return err;
}
+static bool is_mirred_nested(void)
+{
+ return unlikely(__this_cpu_read(mirred_nest_level) > 1);
+}
+
static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
{
int err;
if (!want_ingress)
err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+ else if (is_mirred_nested())
+ err = netif_rx(skb);
else
err = netif_receive_skb(skb);
@@ -226,7 +233,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
struct sk_buff *skb2 = skb;
bool m_mac_header_xmit;
struct net_device *dev;
- unsigned int rec_level;
+ unsigned int nest_level;
int retval, err = 0;
bool use_reinsert;
bool want_ingress;
@@ -237,11 +244,11 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
int mac_len;
bool at_nh;
- rec_level = __this_cpu_inc_return(mirred_rec_level);
- if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
+ nest_level = __this_cpu_inc_return(mirred_nest_level);
+ if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
netdev_name(skb->dev));
- __this_cpu_dec(mirred_rec_level);
+ __this_cpu_dec(mirred_nest_level);
return TC_ACT_SHOT;
}
@@ -310,7 +317,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
err = tcf_mirred_forward(want_ingress, skb);
if (err)
tcf_action_inc_overlimit_qstats(&m->common);
- __this_cpu_dec(mirred_rec_level);
+ __this_cpu_dec(mirred_nest_level);
return TC_ACT_CONSUMED;
}
}
@@ -322,7 +329,7 @@ out:
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
- __this_cpu_dec(mirred_rec_level);
+ __this_cpu_dec(mirred_nest_level);
return retval;
}
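
Aside, not part of the patch: a minimal sketch of the per-CPU nesting guard the mirred change above switches to, with illustrative names. The real code splits the counter handling between tcf_mirred_act() and tcf_mirred_forward(); the point is that a nested call hands the skb to netif_rx() instead of recursing into netif_receive_skb().

#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>

#define EXAMPLE_NEST_LIMIT 4
static DEFINE_PER_CPU(unsigned int, example_nest_level);

static int example_forward(struct sk_buff *skb, bool want_ingress)
{
        unsigned int nest = __this_cpu_inc_return(example_nest_level);
        int err;

        if (unlikely(nest > EXAMPLE_NEST_LIMIT)) {
                /* give up instead of looping through mirrored devices */
                __this_cpu_dec(example_nest_level);
                kfree_skb(skb);
                return -ELOOP;
        }

        if (!want_ingress)
                err = dev_queue_xmit(skb);
        else if (nest > 1)
                /* nested: defer to the backlog to keep the stack shallow */
                err = netif_rx(skb);
        else
                err = netif_receive_skb(skb);

        __this_cpu_dec(example_nest_level);
        return err;
}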
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 74c74be33048..4184af5abbf3 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -38,6 +38,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
{
struct tc_action_net *tn = net_generic(net, act_nat_ops.net_id);
bool bind = flags & TCA_ACT_FLAGS_BIND;
+ struct tcf_nat_parms *nparm, *oparm;
struct nlattr *tb[TCA_NAT_MAX + 1];
struct tcf_chain *goto_ch = NULL;
struct tc_nat *parm;
@@ -59,8 +60,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_nat_ops, bind, false, flags);
+ ret = tcf_idr_create_from_flags(tn, index, est, a, &act_nat_ops,
+ bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -79,19 +80,31 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
if (err < 0)
goto release_idr;
+
+ nparm = kzalloc(sizeof(*nparm), GFP_KERNEL);
+ if (!nparm) {
+ err = -ENOMEM;
+ goto release_idr;
+ }
+
+ nparm->old_addr = parm->old_addr;
+ nparm->new_addr = parm->new_addr;
+ nparm->mask = parm->mask;
+ nparm->flags = parm->flags;
+
p = to_tcf_nat(*a);
spin_lock_bh(&p->tcf_lock);
- p->old_addr = parm->old_addr;
- p->new_addr = parm->new_addr;
- p->mask = parm->mask;
- p->flags = parm->flags;
-
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ oparm = rcu_replace_pointer(p->parms, nparm, lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
+
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
+ if (oparm)
+ kfree_rcu(oparm, rcu);
+
return ret;
release_idr:
tcf_idr_release(*a, bind);
@@ -103,6 +116,7 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb,
struct tcf_result *res)
{
struct tcf_nat *p = to_tcf_nat(a);
+ struct tcf_nat_parms *parms;
struct iphdr *iph;
__be32 old_addr;
__be32 new_addr;
@@ -113,18 +127,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb,
int ihl;
int noff;
- spin_lock(&p->tcf_lock);
-
tcf_lastuse_update(&p->tcf_tm);
- old_addr = p->old_addr;
- new_addr = p->new_addr;
- mask = p->mask;
- egress = p->flags & TCA_NAT_FLAG_EGRESS;
- action = p->tcf_action;
+ tcf_action_update_bstats(&p->common, skb);
- bstats_update(&p->tcf_bstats, skb);
+ action = READ_ONCE(p->tcf_action);
- spin_unlock(&p->tcf_lock);
+ parms = rcu_dereference_bh(p->parms);
+ old_addr = parms->old_addr;
+ new_addr = parms->new_addr;
+ mask = parms->mask;
+ egress = parms->flags & TCA_NAT_FLAG_EGRESS;
if (unlikely(action == TC_ACT_SHOT))
goto drop;
@@ -248,9 +260,7 @@ out:
return action;
drop:
- spin_lock(&p->tcf_lock);
- p->tcf_qstats.drops++;
- spin_unlock(&p->tcf_lock);
+ tcf_action_inc_drop_qstats(&p->common);
return TC_ACT_SHOT;
}
@@ -264,15 +274,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
.refcnt = refcount_read(&p->tcf_refcnt) - ref,
.bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
+ struct tcf_nat_parms *parms;
struct tcf_t t;
spin_lock_bh(&p->tcf_lock);
- opt.old_addr = p->old_addr;
- opt.new_addr = p->new_addr;
- opt.mask = p->mask;
- opt.flags = p->flags;
+
opt.action = p->tcf_action;
+ parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock));
+
+ opt.old_addr = parms->old_addr;
+ opt.new_addr = parms->new_addr;
+ opt.mask = parms->mask;
+ opt.flags = parms->flags;
+
if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -289,6 +304,16 @@ nla_put_failure:
return -1;
}
+static void tcf_nat_cleanup(struct tc_action *a)
+{
+ struct tcf_nat *p = to_tcf_nat(a);
+ struct tcf_nat_parms *parms;
+
+ parms = rcu_dereference_protected(p->parms, 1);
+ if (parms)
+ kfree_rcu(parms, rcu);
+}
+
static struct tc_action_ops act_nat_ops = {
.kind = "nat",
.id = TCA_ID_NAT,
@@ -296,6 +321,7 @@ static struct tc_action_ops act_nat_ops = {
.act = tcf_nat_act,
.dump = tcf_nat_dump,
.init = tcf_nat_init,
+ .cleanup = tcf_nat_cleanup,
.size = sizeof(struct tcf_nat),
};
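
Aside, not part of the patch: the matching packet-path side of the act_nat conversion above, sketched with illustrative names and assuming the usual softirq/RCU-bh context of a tc action. The verdict is read with READ_ONCE() and the parameters through rcu_dereference_bh(), so no per-packet spinlock is needed.

#include <linux/rcupdate.h>
#include <linux/types.h>

struct example_nat_parms {
        __be32 old_addr;
        __be32 new_addr;
        __be32 mask;
        struct rcu_head rcu;
};

struct example_nat {
        int verdict;
        struct example_nat_parms __rcu *parms;
};

static int example_nat_rewrite(const struct example_nat *p, __be32 *addr)
{
        const struct example_nat_parms *parms;
        int verdict = READ_ONCE(p->verdict);

        parms = rcu_dereference_bh(p->parms);
        if (!((*addr ^ parms->old_addr) & parms->mask)) {
                /* keep the host bits, swap in the new network part */
                __be32 new = parms->new_addr & parms->mask;

                *addr = new | (*addr & ~parms->mask);
        }

        return verdict;
}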
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index a0378e9f0121..77d288d384ae 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -134,6 +134,17 @@ nla_failure:
return -EINVAL;
}
+static void tcf_pedit_cleanup_rcu(struct rcu_head *head)
+{
+ struct tcf_pedit_parms *parms =
+ container_of(head, struct tcf_pedit_parms, rcu);
+
+ kfree(parms->tcfp_keys_ex);
+ kfree(parms->tcfp_keys);
+
+ kfree(parms);
+}
+
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
struct tcf_proto *tp, u32 flags,
@@ -141,10 +152,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
{
struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id);
bool bind = flags & TCA_ACT_FLAGS_BIND;
- struct nlattr *tb[TCA_PEDIT_MAX + 1];
struct tcf_chain *goto_ch = NULL;
- struct tc_pedit_key *keys = NULL;
- struct tcf_pedit_key_ex *keys_ex;
+ struct tcf_pedit_parms *oparms, *nparms;
+ struct nlattr *tb[TCA_PEDIT_MAX + 1];
struct tc_pedit *parm;
struct nlattr *pattr;
struct tcf_pedit *p;
@@ -181,18 +191,25 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
- keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
- if (IS_ERR(keys_ex))
- return PTR_ERR(keys_ex);
+ nparms = kzalloc(sizeof(*nparms), GFP_KERNEL);
+ if (!nparms)
+ return -ENOMEM;
+
+ nparms->tcfp_keys_ex =
+ tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
+ if (IS_ERR(nparms->tcfp_keys_ex)) {
+ ret = PTR_ERR(nparms->tcfp_keys_ex);
+ goto out_free;
+ }
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_pedit_ops, bind, false, flags);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_pedit_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
- goto out_free;
+ goto out_free_ex;
}
ret = ACT_P_CREATED;
} else if (err > 0) {
@@ -204,7 +221,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
}
} else {
ret = err;
- goto out_free;
+ goto out_free_ex;
}
err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
@@ -212,48 +229,50 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
ret = err;
goto out_release;
}
- p = to_pedit(*a);
- spin_lock_bh(&p->tcf_lock);
- if (ret == ACT_P_CREATED ||
- (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) {
- keys = kmalloc(ksize, GFP_ATOMIC);
- if (!keys) {
- spin_unlock_bh(&p->tcf_lock);
- ret = -ENOMEM;
- goto put_chain;
- }
- kfree(p->tcfp_keys);
- p->tcfp_keys = keys;
- p->tcfp_nkeys = parm->nkeys;
+ nparms->tcfp_off_max_hint = 0;
+ nparms->tcfp_flags = parm->flags;
+ nparms->tcfp_nkeys = parm->nkeys;
+
+ nparms->tcfp_keys = kmalloc(ksize, GFP_KERNEL);
+ if (!nparms->tcfp_keys) {
+ ret = -ENOMEM;
+ goto put_chain;
}
- memcpy(p->tcfp_keys, parm->keys, ksize);
- p->tcfp_off_max_hint = 0;
- for (i = 0; i < p->tcfp_nkeys; ++i) {
- u32 cur = p->tcfp_keys[i].off;
+
+ memcpy(nparms->tcfp_keys, parm->keys, ksize);
+
+ for (i = 0; i < nparms->tcfp_nkeys; ++i) {
+ u32 cur = nparms->tcfp_keys[i].off;
/* sanitize the shift value for any later use */
- p->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1,
- p->tcfp_keys[i].shift);
+ nparms->tcfp_keys[i].shift = min_t(size_t,
+ BITS_PER_TYPE(int) - 1,
+ nparms->tcfp_keys[i].shift);
/* The AT option can read a single byte, we can bound the actual
* value with uchar max.
*/
- cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift;
+ cur += (0xff & nparms->tcfp_keys[i].offmask) >> nparms->tcfp_keys[i].shift;
/* Each key touches 4 bytes starting from the computed offset */
- p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4);
+ nparms->tcfp_off_max_hint =
+ max(nparms->tcfp_off_max_hint, cur + 4);
}
- p->tcfp_flags = parm->flags;
+ p = to_pedit(*a);
+
+ spin_lock_bh(&p->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ oparms = rcu_replace_pointer(p->parms, nparms, 1);
+ spin_unlock_bh(&p->tcf_lock);
- kfree(p->tcfp_keys_ex);
- p->tcfp_keys_ex = keys_ex;
+ if (oparms)
+ call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu);
- spin_unlock_bh(&p->tcf_lock);
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
+
return ret;
put_chain:
@@ -261,19 +280,22 @@ put_chain:
tcf_chain_put_by_act(goto_ch);
out_release:
tcf_idr_release(*a, bind);
+out_free_ex:
+ kfree(nparms->tcfp_keys_ex);
out_free:
- kfree(keys_ex);
+ kfree(nparms);
return ret;
-
}
static void tcf_pedit_cleanup(struct tc_action *a)
{
struct tcf_pedit *p = to_pedit(a);
- struct tc_pedit_key *keys = p->tcfp_keys;
+ struct tcf_pedit_parms *parms;
- kfree(keys);
- kfree(p->tcfp_keys_ex);
+ parms = rcu_dereference_protected(p->parms, 1);
+
+ if (parms)
+ call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu);
}
static bool offset_valid(struct sk_buff *skb, int offset)
@@ -324,109 +346,105 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
const struct tc_action *a,
struct tcf_result *res)
{
+ enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+ enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
struct tcf_pedit *p = to_pedit(a);
+ struct tcf_pedit_key_ex *tkey_ex;
+ struct tcf_pedit_parms *parms;
+ struct tc_pedit_key *tkey;
u32 max_offset;
int i;
- spin_lock(&p->tcf_lock);
+ parms = rcu_dereference_bh(p->parms);
max_offset = (skb_transport_header_was_set(skb) ?
skb_transport_offset(skb) :
skb_network_offset(skb)) +
- p->tcfp_off_max_hint;
+ parms->tcfp_off_max_hint;
if (skb_ensure_writable(skb, min(skb->len, max_offset)))
- goto unlock;
+ goto done;
tcf_lastuse_update(&p->tcf_tm);
+ tcf_action_update_bstats(&p->common, skb);
- if (p->tcfp_nkeys > 0) {
- struct tc_pedit_key *tkey = p->tcfp_keys;
- struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
- enum pedit_header_type htype =
- TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
- enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
-
- for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
- u32 *ptr, hdata;
- int offset = tkey->off;
- int hoffset;
- u32 val;
- int rc;
-
- if (tkey_ex) {
- htype = tkey_ex->htype;
- cmd = tkey_ex->cmd;
-
- tkey_ex++;
- }
+ tkey = parms->tcfp_keys;
+ tkey_ex = parms->tcfp_keys_ex;
- rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
- if (rc) {
- pr_info("tc action pedit bad header type specified (0x%x)\n",
- htype);
- goto bad;
- }
+ for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) {
+ int offset = tkey->off;
+ u32 *ptr, hdata;
+ int hoffset;
+ u32 val;
+ int rc;
- if (tkey->offmask) {
- u8 *d, _d;
-
- if (!offset_valid(skb, hoffset + tkey->at)) {
- pr_info("tc action pedit 'at' offset %d out of bounds\n",
- hoffset + tkey->at);
- goto bad;
- }
- d = skb_header_pointer(skb, hoffset + tkey->at,
- sizeof(_d), &_d);
- if (!d)
- goto bad;
- offset += (*d & tkey->offmask) >> tkey->shift;
- }
+ if (tkey_ex) {
+ htype = tkey_ex->htype;
+ cmd = tkey_ex->cmd;
- if (offset % 4) {
- pr_info("tc action pedit offset must be on 32 bit boundaries\n");
- goto bad;
- }
+ tkey_ex++;
+ }
- if (!offset_valid(skb, hoffset + offset)) {
- pr_info("tc action pedit offset %d out of bounds\n",
- hoffset + offset);
- goto bad;
- }
+ rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
+ if (rc) {
+ pr_info("tc action pedit bad header type specified (0x%x)\n",
+ htype);
+ goto bad;
+ }
- ptr = skb_header_pointer(skb, hoffset + offset,
- sizeof(hdata), &hdata);
- if (!ptr)
- goto bad;
- /* just do it, baby */
- switch (cmd) {
- case TCA_PEDIT_KEY_EX_CMD_SET:
- val = tkey->val;
- break;
- case TCA_PEDIT_KEY_EX_CMD_ADD:
- val = (*ptr + tkey->val) & ~tkey->mask;
- break;
- default:
- pr_info("tc action pedit bad command (%d)\n",
- cmd);
+ if (tkey->offmask) {
+ u8 *d, _d;
+
+ if (!offset_valid(skb, hoffset + tkey->at)) {
+ pr_info("tc action pedit 'at' offset %d out of bounds\n",
+ hoffset + tkey->at);
goto bad;
}
+ d = skb_header_pointer(skb, hoffset + tkey->at,
+ sizeof(_d), &_d);
+ if (!d)
+ goto bad;
+ offset += (*d & tkey->offmask) >> tkey->shift;
+ }
- *ptr = ((*ptr & tkey->mask) ^ val);
- if (ptr == &hdata)
- skb_store_bits(skb, hoffset + offset, ptr, 4);
+ if (offset % 4) {
+ pr_info("tc action pedit offset must be on 32 bit boundaries\n");
+ goto bad;
}
- goto done;
- } else {
- WARN(1, "pedit BUG: index %d\n", p->tcf_index);
+ if (!offset_valid(skb, hoffset + offset)) {
+ pr_info("tc action pedit offset %d out of bounds\n",
+ hoffset + offset);
+ goto bad;
+ }
+
+ ptr = skb_header_pointer(skb, hoffset + offset,
+ sizeof(hdata), &hdata);
+ if (!ptr)
+ goto bad;
+ /* just do it, baby */
+ switch (cmd) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ val = tkey->val;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ val = (*ptr + tkey->val) & ~tkey->mask;
+ break;
+ default:
+ pr_info("tc action pedit bad command (%d)\n",
+ cmd);
+ goto bad;
+ }
+
+ *ptr = ((*ptr & tkey->mask) ^ val);
+ if (ptr == &hdata)
+ skb_store_bits(skb, hoffset + offset, ptr, 4);
}
+ goto done;
+
bad:
- p->tcf_qstats.overlimits++;
+ tcf_action_inc_overlimit_qstats(&p->common);
done:
- bstats_update(&p->tcf_bstats, skb);
-unlock:
- spin_unlock(&p->tcf_lock);
return p->tcf_action;
}
@@ -445,30 +463,33 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_pedit *p = to_pedit(a);
+ struct tcf_pedit_parms *parms;
struct tc_pedit *opt;
struct tcf_t t;
int s;
- s = struct_size(opt, keys, p->tcfp_nkeys);
+ spin_lock_bh(&p->tcf_lock);
+ parms = rcu_dereference_protected(p->parms, 1);
+ s = struct_size(opt, keys, parms->tcfp_nkeys);
- /* netlink spinlocks held above us - must use ATOMIC */
opt = kzalloc(s, GFP_ATOMIC);
- if (unlikely(!opt))
+ if (unlikely(!opt)) {
+ spin_unlock_bh(&p->tcf_lock);
return -ENOBUFS;
+ }
- spin_lock_bh(&p->tcf_lock);
- memcpy(opt->keys, p->tcfp_keys, flex_array_size(opt, keys, p->tcfp_nkeys));
+ memcpy(opt->keys, parms->tcfp_keys,
+ flex_array_size(opt, keys, parms->tcfp_nkeys));
opt->index = p->tcf_index;
- opt->nkeys = p->tcfp_nkeys;
- opt->flags = p->tcfp_flags;
+ opt->nkeys = parms->tcfp_nkeys;
+ opt->flags = parms->tcfp_flags;
opt->action = p->tcf_action;
opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
- if (p->tcfp_keys_ex) {
- if (tcf_pedit_key_ex_dump(skb,
- p->tcfp_keys_ex,
- p->tcfp_nkeys))
+ if (parms->tcfp_keys_ex) {
+ if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex,
+ parms->tcfp_nkeys))
goto nla_put_failure;
if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
@@ -522,7 +543,28 @@ static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data,
}
*index_inc = k;
} else {
- return -EOPNOTSUPP;
+ struct flow_offload_action *fl_action = entry_data;
+ u32 cmd = tcf_pedit_cmd(act, 0);
+ int k;
+
+ switch (cmd) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ fl_action->id = FLOW_ACTION_MANGLE;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ fl_action->id = FLOW_ACTION_ADD;
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload");
+ return -EOPNOTSUPP;
+ }
+
+ for (k = 1; k < tcf_pedit_nkeys(act); k++) {
+ if (cmd != tcf_pedit_cmd(act, k)) {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload");
+ return -EOPNOTSUPP;
+ }
+ }
}
return 0;
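
Aside, not part of the patch: unlike connmark and nat, the pedit parameter block shown above owns separately allocated key arrays, so the patch frees it through call_rcu() with a dedicated callback instead of plain kfree_rcu(). A minimal sketch with illustrative names:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_pedit_parms {
        void *keys;
        void *keys_ex;
        struct rcu_head rcu;
};

struct example_pedit {
        struct example_pedit_parms __rcu *parms;
};

static void example_pedit_free_rcu(struct rcu_head *head)
{
        struct example_pedit_parms *parms =
                container_of(head, struct example_pedit_parms, rcu);

        /* the embedded allocations go first, then the block itself */
        kfree(parms->keys_ex);
        kfree(parms->keys);
        kfree(parms);
}

static void example_pedit_cleanup(struct example_pedit *p)
{
        struct example_pedit_parms *parms;

        /* last reference is gone, so "1" is a valid protection condition */
        parms = rcu_dereference_protected(p->parms, 1);
        if (parms)
                call_rcu(&parms->rcu, example_pedit_free_rcu);
}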
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 668130f08903..3569e2c3660c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -22,6 +22,7 @@
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/rculist.h>
+#include <linux/rhashtable.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -50,6 +51,109 @@ static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+static struct xarray tcf_exts_miss_cookies_xa;
+struct tcf_exts_miss_cookie_node {
+ const struct tcf_chain *chain;
+ const struct tcf_proto *tp;
+ const struct tcf_exts *exts;
+ u32 chain_index;
+ u32 tp_prio;
+ u32 handle;
+ u32 miss_cookie_base;
+ struct rcu_head rcu;
+};
+
+/* Each tc action entry cookie will be comprised of 32bit miss_cookie_base +
+ * action index in the exts tc actions array.
+ */
+union tcf_exts_miss_cookie {
+ struct {
+ u32 miss_cookie_base;
+ u32 act_index;
+ };
+ u64 miss_cookie;
+};
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static int
+tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
+ u32 handle)
+{
+ struct tcf_exts_miss_cookie_node *n;
+ static u32 next;
+ int err;
+
+ if (WARN_ON(!handle || !tp->ops->get_exts))
+ return -EINVAL;
+
+ n = kzalloc(sizeof(*n), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ n->chain_index = tp->chain->index;
+ n->chain = tp->chain;
+ n->tp_prio = tp->prio;
+ n->tp = tp;
+ n->exts = exts;
+ n->handle = handle;
+
+ err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base,
+ n, xa_limit_32b, &next, GFP_KERNEL);
+ if (err)
+ goto err_xa_alloc;
+
+ exts->miss_cookie_node = n;
+ return 0;
+
+err_xa_alloc:
+ kfree(n);
+ return err;
+}
+
+static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts)
+{
+ struct tcf_exts_miss_cookie_node *n;
+
+ if (!exts->miss_cookie_node)
+ return;
+
+ n = exts->miss_cookie_node;
+ xa_erase(&tcf_exts_miss_cookies_xa, n->miss_cookie_base);
+ kfree_rcu(n, rcu);
+}
+
+static struct tcf_exts_miss_cookie_node *
+tcf_exts_miss_cookie_lookup(u64 miss_cookie, int *act_index)
+{
+ union tcf_exts_miss_cookie mc = { .miss_cookie = miss_cookie, };
+
+ *act_index = mc.act_index;
+ return xa_load(&tcf_exts_miss_cookies_xa, mc.miss_cookie_base);
+}
+#else /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
+static int
+tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
+ u32 handle)
+{
+ return 0;
+}
+
+static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts)
+{
+}
+#endif /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
+
+static u64 tcf_exts_miss_cookie_get(u32 miss_cookie_base, int act_index)
+{
+ union tcf_exts_miss_cookie mc = { .act_index = act_index, };
+
+ if (!miss_cookie_base)
+ return 0;
+
+ mc.miss_cookie_base = miss_cookie_base;
+ return mc.miss_cookie;
+}
+
#ifdef CONFIG_NET_CLS_ACT
DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc);
EXPORT_SYMBOL(tc_skb_ext_tc);
@@ -488,7 +592,8 @@ static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block,
#endif
static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
- u32 seq, u16 flags, int event, bool unicast);
+ u32 seq, u16 flags, int event, bool unicast,
+ struct netlink_ext_ack *extack);
static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
u32 chain_index, bool create,
@@ -521,7 +626,7 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
*/
if (is_first_reference && !by_act)
tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
- RTM_NEWCHAIN, false);
+ RTM_NEWCHAIN, false, NULL);
return chain;
@@ -1548,6 +1653,8 @@ static inline int __tcf_classify(struct sk_buff *skb,
const struct tcf_proto *orig_tp,
struct tcf_result *res,
bool compat_mode,
+ struct tcf_exts_miss_cookie_node *n,
+ int act_index,
u32 *last_executed_chain)
{
#ifdef CONFIG_NET_CLS_ACT
@@ -1559,13 +1666,36 @@ reclassify:
#endif
for (; tp; tp = rcu_dereference_bh(tp->next)) {
__be16 protocol = skb_protocol(skb, false);
- int err;
+ int err = 0;
- if (tp->protocol != protocol &&
- tp->protocol != htons(ETH_P_ALL))
- continue;
+ if (n) {
+ struct tcf_exts *exts;
+
+ if (n->tp_prio != tp->prio)
+ continue;
+
+ /* We re-lookup the tp and chain based on index instead
+ * of having hard refs and locks to them, so do a sanity
+ * check if any of tp,chain,exts was replaced by the
+ * time we got here with a cookie from hardware.
+ */
+ if (unlikely(n->tp != tp || n->tp->chain != n->chain ||
+ !tp->ops->get_exts))
+ return TC_ACT_SHOT;
+
+ exts = tp->ops->get_exts(tp, n->handle);
+ if (unlikely(!exts || n->exts != exts))
+ return TC_ACT_SHOT;
- err = tc_classify(skb, tp, res);
+ n = NULL;
+ err = tcf_exts_exec_ex(skb, exts, act_index, res);
+ } else {
+ if (tp->protocol != protocol &&
+ tp->protocol != htons(ETH_P_ALL))
+ continue;
+
+ err = tc_classify(skb, tp, res);
+ }
#ifdef CONFIG_NET_CLS_ACT
if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) {
first_tp = orig_tp;
@@ -1581,6 +1711,9 @@ reclassify:
return err;
}
+ if (unlikely(n))
+ return TC_ACT_SHOT;
+
return TC_ACT_UNSPEC; /* signal: continue lookup */
#ifdef CONFIG_NET_CLS_ACT
reset:
@@ -1605,21 +1738,35 @@ int tcf_classify(struct sk_buff *skb,
#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
u32 last_executed_chain = 0;
- return __tcf_classify(skb, tp, tp, res, compat_mode,
+ return __tcf_classify(skb, tp, tp, res, compat_mode, NULL, 0,
&last_executed_chain);
#else
u32 last_executed_chain = tp ? tp->chain->index : 0;
+ struct tcf_exts_miss_cookie_node *n = NULL;
const struct tcf_proto *orig_tp = tp;
struct tc_skb_ext *ext;
+ int act_index = 0;
int ret;
if (block) {
ext = skb_ext_find(skb, TC_SKB_EXT);
- if (ext && ext->chain) {
+ if (ext && (ext->chain || ext->act_miss)) {
struct tcf_chain *fchain;
+ u32 chain;
+
+ if (ext->act_miss) {
+ n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie,
+ &act_index);
+ if (!n)
+ return TC_ACT_SHOT;
- fchain = tcf_chain_lookup_rcu(block, ext->chain);
+ chain = n->chain_index;
+ } else {
+ chain = ext->chain;
+ }
+
+ fchain = tcf_chain_lookup_rcu(block, chain);
if (!fchain)
return TC_ACT_SHOT;
@@ -1631,7 +1778,7 @@ int tcf_classify(struct sk_buff *skb,
}
}
- ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode,
+ ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, n, act_index,
&last_executed_chain);
if (tc_skb_ext_tc_enabled()) {
@@ -1817,7 +1964,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcf_proto *tp, struct tcf_block *block,
struct Qdisc *q, u32 parent, void *fh,
u32 portid, u32 seq, u16 flags, int event,
- bool terse_dump, bool rtnl_held)
+ bool terse_dump, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -1857,7 +2005,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0)
goto nla_put_failure;
}
+
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto nla_put_failure;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -1871,7 +2025,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
struct tcf_block *block, struct Qdisc *q,
u32 parent, void *fh, int event, bool unicast,
- bool rtnl_held)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -1883,7 +2037,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
n->nlmsg_seq, n->nlmsg_flags, event,
- false, rtnl_held) <= 0) {
+ false, rtnl_held, extack) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1912,7 +2066,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER,
- false, rtnl_held) <= 0) {
+ false, rtnl_held, extack) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to build del event notification");
kfree_skb(skb);
return -EINVAL;
@@ -1938,14 +2092,15 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
struct tcf_block *block, struct Qdisc *q,
u32 parent, struct nlmsghdr *n,
- struct tcf_chain *chain, int event)
+ struct tcf_chain *chain, int event,
+ struct netlink_ext_ack *extack)
{
struct tcf_proto *tp;
for (tp = tcf_get_next_proto(chain, NULL);
tp; tp = tcf_get_next_proto(chain, tp))
- tfilter_notify(net, oskb, n, tp, block,
- q, parent, NULL, event, false, true);
+ tfilter_notify(net, oskb, n, tp, block, q, parent, NULL,
+ event, false, true, extack);
}
static void tfilter_put(struct tcf_proto *tp, void *fh)
@@ -2156,7 +2311,7 @@ replay:
flags, extack);
if (err == 0) {
tfilter_notify(net, skb, n, tp, block, q, parent, fh,
- RTM_NEWTFILTER, false, rtnl_held);
+ RTM_NEWTFILTER, false, rtnl_held, extack);
tfilter_put(tp, fh);
/* q pointer is NULL for shared blocks */
if (q)
@@ -2284,7 +2439,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (prio == 0) {
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER);
+ chain, RTM_DELTFILTER, extack);
tcf_chain_flush(chain, rtnl_held);
err = 0;
goto errout;
@@ -2308,7 +2463,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
tcf_proto_put(tp, rtnl_held, NULL);
tfilter_notify(net, skb, n, tp, block, q, parent, fh,
- RTM_DELTFILTER, false, rtnl_held);
+ RTM_DELTFILTER, false, rtnl_held, extack);
err = 0;
goto errout;
}
@@ -2452,7 +2607,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
err = -ENOENT;
} else {
err = tfilter_notify(net, skb, n, tp, block, q, parent,
- fh, RTM_NEWTFILTER, true, rtnl_held);
+ fh, RTM_NEWTFILTER, true, rtnl_held, NULL);
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
}
@@ -2490,7 +2645,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER, a->terse_dump, true);
+ RTM_NEWTFILTER, a->terse_dump, true, NULL);
}
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
@@ -2524,7 +2679,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER, false, true) <= 0)
+ RTM_NEWTFILTER, false, true, NULL) <= 0)
goto errout;
cb->args[1] = 1;
}
@@ -2667,7 +2822,8 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops,
void *tmplt_priv, u32 chain_index,
struct net *net, struct sk_buff *skb,
struct tcf_block *block,
- u32 portid, u32 seq, u16 flags, int event)
+ u32 portid, u32 seq, u16 flags, int event,
+ struct netlink_ext_ack *extack)
{
unsigned char *b = skb_tail_pointer(skb);
const struct tcf_proto_ops *ops;
@@ -2704,7 +2860,12 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops,
goto nla_put_failure;
}
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -2714,7 +2875,8 @@ nla_put_failure:
}
static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
- u32 seq, u16 flags, int event, bool unicast)
+ u32 seq, u16 flags, int event, bool unicast,
+ struct netlink_ext_ack *extack)
{
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
struct tcf_block *block = chain->block;
@@ -2728,7 +2890,7 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv,
chain->index, net, skb, block, portid,
- seq, flags, event) <= 0) {
+ seq, flags, event, extack) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -2756,7 +2918,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops,
return -ENOBUFS;
if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb,
- block, portid, seq, flags, RTM_DELCHAIN) <= 0) {
+ block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -2908,11 +3070,11 @@ replay:
}
tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
- RTM_NEWCHAIN, false);
+ RTM_NEWCHAIN, false, extack);
break;
case RTM_DELCHAIN:
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER);
+ chain, RTM_DELTFILTER, extack);
/* Flush the chain first as the user requested chain removal. */
tcf_chain_flush(chain, true);
/* In case the chain was successfully deleted, put a reference
@@ -2922,7 +3084,7 @@ replay:
break;
case RTM_GETCHAIN:
err = tc_chain_notify(chain, skb, n->nlmsg_seq,
- n->nlmsg_flags, n->nlmsg_type, true);
+ n->nlmsg_flags, n->nlmsg_type, true, extack);
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send chain notify message");
break;
@@ -3022,7 +3184,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
chain->index, net, skb, block,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWCHAIN);
+ RTM_NEWCHAIN, NULL);
if (err <= 0)
break;
index++;
@@ -3040,9 +3202,48 @@ out:
return skb->len;
}
+int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action,
+ int police, struct tcf_proto *tp, u32 handle,
+ bool use_action_miss)
+{
+ int err = 0;
+
+#ifdef CONFIG_NET_CLS_ACT
+ exts->type = 0;
+ exts->nr_actions = 0;
+ /* Note: we do not own yet a reference on net.
+ * This reference might be taken later from tcf_exts_get_net().
+ */
+ exts->net = net;
+ exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
+ GFP_KERNEL);
+ if (!exts->actions)
+ return -ENOMEM;
+#endif
+
+ exts->action = action;
+ exts->police = police;
+
+ if (!use_action_miss)
+ return 0;
+
+ err = tcf_exts_miss_cookie_base_alloc(exts, tp, handle);
+ if (err)
+ goto err_miss_alloc;
+
+ return 0;
+
+err_miss_alloc:
+ tcf_exts_destroy(exts);
+ return err;
+}
+EXPORT_SYMBOL(tcf_exts_init_ex);
+
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
+ tcf_exts_miss_cookie_base_destroy(exts);
+
if (exts->actions) {
tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
kfree(exts->actions);
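
tcf_exts_init_ex() is the new entry point that, on top of the old tcf_exts_init() behaviour, can allocate an "action miss" cookie bound to the (tp, handle) pair via tcf_exts_miss_cookie_base_alloc(); tcf_exts_destroy() now releases it again. A usage sketch modelled on the cls_flower hunk further down in this diff; the TCA_FLOWER_ACT constant and flags field stand in for whatever classifier adopts the helper:

/* Sketch: ask for a miss cookie only when the filter may be offloaded,
 * i.e. when the user did not request skip_hw.
 */
err = tcf_exts_init_ex(&f->exts, net, TCA_FLOWER_ACT, 0,
                       tp, handle, !tc_skip_hw(f->flags));
if (err < 0)
        goto errout;    /* nothing to undo here: the helper cleans up on failure */
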
@@ -3474,28 +3675,28 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
}
EXPORT_SYMBOL(tc_setup_cb_reoffload);
-static int tcf_act_get_cookie(struct flow_action_entry *entry,
- const struct tc_action *act)
+static int tcf_act_get_user_cookie(struct flow_action_entry *entry,
+ const struct tc_action *act)
{
- struct tc_cookie *cookie;
+ struct tc_cookie *user_cookie;
int err = 0;
rcu_read_lock();
- cookie = rcu_dereference(act->act_cookie);
- if (cookie) {
- entry->cookie = flow_action_cookie_create(cookie->data,
- cookie->len,
- GFP_ATOMIC);
- if (!entry->cookie)
+ user_cookie = rcu_dereference(act->user_cookie);
+ if (user_cookie) {
+ entry->user_cookie = flow_action_cookie_create(user_cookie->data,
+ user_cookie->len,
+ GFP_ATOMIC);
+ if (!entry->user_cookie)
err = -ENOMEM;
}
rcu_read_unlock();
return err;
}
-static void tcf_act_put_cookie(struct flow_action_entry *entry)
+static void tcf_act_put_user_cookie(struct flow_action_entry *entry)
{
- flow_action_cookie_destroy(entry->cookie);
+ flow_action_cookie_destroy(entry->user_cookie);
}
void tc_cleanup_offload_action(struct flow_action *flow_action)
@@ -3504,7 +3705,7 @@ void tc_cleanup_offload_action(struct flow_action *flow_action)
int i;
flow_action_for_each(i, entry, flow_action) {
- tcf_act_put_cookie(entry);
+ tcf_act_put_user_cookie(entry);
if (entry->destructor)
entry->destructor(entry->destructor_priv);
}
@@ -3531,6 +3732,7 @@ static int tc_setup_offload_act(struct tc_action *act,
int tc_setup_action(struct flow_action *flow_action,
struct tc_action *actions[],
+ u32 miss_cookie_base,
struct netlink_ext_ack *extack)
{
int i, j, k, index, err = 0;
@@ -3549,7 +3751,7 @@ int tc_setup_action(struct flow_action *flow_action,
entry = &flow_action->entries[j];
spin_lock_bh(&act->tcfa_lock);
- err = tcf_act_get_cookie(entry, act);
+ err = tcf_act_get_user_cookie(entry, act);
if (err)
goto err_out_locked;
@@ -3561,6 +3763,9 @@ int tc_setup_action(struct flow_action *flow_action,
for (k = 0; k < index ; k++) {
entry[k].hw_stats = tc_act_hw_stats(act->hw_stats);
entry[k].hw_index = act->tcfa_index;
+ entry[k].cookie = (unsigned long)act;
+ entry[k].miss_cookie =
+ tcf_exts_miss_cookie_get(miss_cookie_base, i);
}
j += index;
@@ -3583,10 +3788,15 @@ int tc_setup_offload_action(struct flow_action *flow_action,
struct netlink_ext_ack *extack)
{
#ifdef CONFIG_NET_CLS_ACT
+ u32 miss_cookie_base;
+
if (!exts)
return 0;
- return tc_setup_action(flow_action, exts->actions, extack);
+ miss_cookie_base = exts->miss_cookie_node ?
+ exts->miss_cookie_node->miss_cookie_base : 0;
+ return tc_setup_action(flow_action, exts->actions, miss_cookie_base,
+ extack);
#else
return 0;
#endif
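
Each offloaded flow_action entry is now tagged twice: entry->cookie carries the kernel's tc_action pointer, while entry->miss_cookie combines the per-exts miss_cookie_base with the action's position in the filter through tcf_exts_miss_cookie_get(). tc_setup_offload_action() derives the base from exts->miss_cookie_node, which only exists when the classifier requested it in tcf_exts_init_ex(); otherwise the base is 0. Restating the per-entry assignment from the hunk above with explanatory comments; the exact bit layout produced by tcf_exts_miss_cookie_get() is not spelled out in this diff:

        for (k = 0; k < index; k++) {
                entry[k].hw_stats = tc_act_hw_stats(act->hw_stats);
                entry[k].hw_index = act->tcfa_index;
                /* identifies the tc_action object itself */
                entry[k].cookie = (unsigned long)act;
                /* identifies "this filter, action #i", so hardware can report
                 * where a packet stopped being processed
                 */
                entry[k].miss_cookie =
                        tcf_exts_miss_cookie_get(miss_cookie_base, i);
        }
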
@@ -3754,6 +3964,8 @@ static int __init tc_filter_init(void)
if (err)
goto err_register_pernet_subsys;
+ xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1);
+
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
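
tc_filter_init() also initialises the global tcf_exts_miss_cookies_xa with XA_FLAGS_ALLOC1, so allocated miss cookie bases start at 1 and a base of 0 can safely mean "no cookie" (which the exts->miss_cookie_node ? ... : 0 test above relies on). A rough, assumed sketch of what tcf_exts_miss_cookie_base_alloc() plausibly does with that xarray; only the xarray itself and the miss_cookie_base field appear in this diff, the node type and limits here are guesses:

/* Hypothetical node; only ->miss_cookie_base is visible in this diff. */
struct example_miss_cookie_node {
        struct tcf_proto *tp;
        struct tcf_exts *exts;
        u32 handle;
        u32 miss_cookie_base;
};

static int example_miss_cookie_base_alloc(struct tcf_exts *exts,
                                          struct tcf_proto *tp, u32 handle)
{
        struct example_miss_cookie_node *n;
        int err;

        n = kzalloc(sizeof(*n), GFP_KERNEL);
        if (!n)
                return -ENOMEM;
        n->tp = tp;
        n->exts = exts;
        n->handle = handle;

        /* XA_FLAGS_ALLOC1 guarantees a non-zero id */
        err = xa_alloc(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n,
                       xa_limit_32b, GFP_KERNEL);
        if (err) {
                kfree(n);
                return err;
        }
        exts->miss_cookie_node = n;
        return 0;
}
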
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 0b15698b3531..e960a46b0520 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -502,12 +502,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false,
rtnl_held);
- tcf_exts_hw_stats_update(&f->exts, cls_flower.stats.bytes,
- cls_flower.stats.pkts,
- cls_flower.stats.drops,
- cls_flower.stats.lastused,
- cls_flower.stats.used_hw_stats,
- cls_flower.stats.used_hw_stats_valid);
+ tcf_exts_hw_stats_update(&f->exts, &cls_flower.stats, cls_flower.use_act_stats);
}
static void __fl_put(struct cls_fl_filter *f)
@@ -534,6 +529,15 @@ static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
return f;
}
+static struct tcf_exts *fl_get_exts(const struct tcf_proto *tp, u32 handle)
+{
+ struct cls_fl_head *head = rcu_dereference_bh(tp->root);
+ struct cls_fl_filter *f;
+
+ f = idr_find(&head->handle_idr, handle);
+ return f ? &f->exts : NULL;
+}
+
static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
bool *last, bool rtnl_held,
struct netlink_ext_ack *extack)
@@ -2192,10 +2196,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
INIT_LIST_HEAD(&fnew->hw_list);
refcount_set(&fnew->refcnt, 1);
- err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
- if (err < 0)
- goto errout;
-
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
@@ -2205,15 +2205,46 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
}
}
+ if (!fold) {
+ spin_lock(&tp->lock);
+ if (!handle) {
+ handle = 1;
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ INT_MAX, GFP_ATOMIC);
+ } else {
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ handle, GFP_ATOMIC);
+
+ /* Filter with specified handle was concurrently
+ * inserted after initial check in cls_api. This is not
+ * necessarily an error if NLM_F_EXCL is not set in
+ * message flags. Returning EAGAIN will cause cls_api to
+ * try to update concurrently inserted rule.
+ */
+ if (err == -ENOSPC)
+ err = -EAGAIN;
+ }
+ spin_unlock(&tp->lock);
+
+ if (err)
+ goto errout;
+ }
+ fnew->handle = handle;
+
+ err = tcf_exts_init_ex(&fnew->exts, net, TCA_FLOWER_ACT, 0, tp, handle,
+ !tc_skip_hw(fnew->flags));
+ if (err < 0)
+ goto errout_idr;
+
err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE],
tp->chain->tmplt_priv, flags, fnew->flags,
extack);
if (err)
- goto errout;
+ goto errout_idr;
err = fl_check_assign_mask(head, fnew, fold, mask);
if (err)
- goto errout;
+ goto errout_idr;
err = fl_ht_insert_unique(fnew, fold, &in_ht);
if (err)
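
Handle allocation in fl_change() moves ahead of tcf_exts_init_ex() because the miss cookie has to be registered against the final (tp, handle) pair; the old allocation block at the tail of the function (removed below) becomes a simple idr_replace() of the placeholder, and the error path gains an errout_idr label. A condensed sketch of the new ordering:

        /* fl_change() ordering after this hunk (sketch):
         * 1. if there is no old filter, reserve the handle in head->handle_idr
         *    (fnew acts as a placeholder)
         * 2. tcf_exts_init_ex(..., tp, handle, ...) so the miss cookie can
         *    refer to that handle
         * 3. parse parms, assign the mask, insert into the hash table
         * 4. on success replace the placeholder with idr_replace(); on failure
         *    unwind through errout_idr and drop the reservation
         */
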
@@ -2279,29 +2310,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
refcount_dec(&fold->refcnt);
__fl_put(fold);
} else {
- if (handle) {
- /* user specifies a handle and it doesn't exist */
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- handle, GFP_ATOMIC);
-
- /* Filter with specified handle was concurrently
- * inserted after initial check in cls_api. This is not
- * necessarily an error if NLM_F_EXCL is not set in
- * message flags. Returning EAGAIN will cause cls_api to
- * try to update concurrently inserted rule.
- */
- if (err == -ENOSPC)
- err = -EAGAIN;
- } else {
- handle = 1;
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- INT_MAX, GFP_ATOMIC);
- }
- if (err)
- goto errout_hw;
+ idr_replace(&head->handle_idr, fnew, fnew->handle);
refcount_inc(&fnew->refcnt);
- fnew->handle = handle;
list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
spin_unlock(&tp->lock);
}
@@ -2324,6 +2335,8 @@ errout_hw:
fnew->mask->filter_ht_params);
errout_mask:
fl_mask_put(head, fnew->mask);
+errout_idr:
+ idr_remove(&head->handle_idr, fnew->handle);
errout:
__fl_put(fnew);
errout_tb:
@@ -3441,6 +3454,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.tmplt_create = fl_tmplt_create,
.tmplt_destroy = fl_tmplt_destroy,
.tmplt_dump = fl_tmplt_dump,
+ .get_exts = fl_get_exts,
.owner = THIS_MODULE,
.flags = TCF_PROTO_OPS_DOIT_UNLOCKED,
};
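
cls_flower also registers a get_exts callback so that the core, given the (tp, handle) recorded behind a miss cookie, can get back to the filter's tcf_exts; fl_get_exts() uses rcu_dereference_bh(), so the caller is expected to hold rcu_read_lock_bh(). A hedged sketch of such a caller; example_resolve_miss() is hypothetical, only the op itself is added by this diff:

/* Sketch: map a (tp, handle) pair back to its actions, e.g. when hardware
 * reports a miss cookie and the software datapath must continue from there.
 */
static struct tcf_exts *example_resolve_miss(const struct tcf_proto *tp,
                                             u32 handle)
{
        if (!tp->ops->get_exts)
                return NULL;
        return tp->ops->get_exts(tp, handle);   /* fl_get_exts() for flower */
}
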
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 705f63da2c21..fa3bbd187eb9 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -331,11 +331,7 @@ static void mall_stats_hw_filter(struct tcf_proto *tp,
tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
- tcf_exts_hw_stats_update(&head->exts, cls_mall.stats.bytes,
- cls_mall.stats.pkts, cls_mall.stats.drops,
- cls_mall.stats.lastused,
- cls_mall.stats.used_hw_stats,
- cls_mall.stats.used_hw_stats_valid);
+ tcf_exts_hw_stats_update(&head->exts, &cls_mall.stats, cls_mall.use_act_stats);
}
static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
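
As in cls_flower above, mall_stats_hw_filter() now hands the whole stats structure plus the driver's use_act_stats flag to tcf_exts_hw_stats_update() rather than unpacking five scalar fields at every call site; use_act_stats presumably signals that the driver reported counters per action instead of per rule. The new call shape:

        tcf_exts_hw_stats_update(&head->exts, &cls_mall.stats,
                                 cls_mall.use_act_stats);
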
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
deleted file mode 100644
index 03d8619bd9c6..000000000000
--- a/net/sched/cls_rsvp.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <net/ip.h>
-#include <net/netlink.h>
-#include <net/act_api.h>
-#include <net/pkt_cls.h>
-#include <net/tc_wrapper.h>
-
-#define RSVP_DST_LEN 1
-#define RSVP_ID "rsvp"
-#define RSVP_OPS cls_rsvp_ops
-#define RSVP_CLS rsvp_classify
-
-#include "cls_rsvp.h"
-MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
deleted file mode 100644
index 869efba9f834..000000000000
--- a/net/sched/cls_rsvp.h
+++ /dev/null
@@ -1,764 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-/*
- Comparing to general packet classification problem,
- RSVP needs only several relatively simple rules:
-
- * (dst, protocol) are always specified,
- so that we are able to hash them.
- * src may be exact, or may be wildcard, so that
- we can keep a hash table plus one wildcard entry.
- * source port (or flow label) is important only if src is given.
-
- IMPLEMENTATION.
-
- We use a two level hash table: The top level is keyed by
- destination address and protocol ID, every bucket contains a list
- of "rsvp sessions", identified by destination address, protocol and
- DPI(="Destination Port ID"): triple (key, mask, offset).
-
- Every bucket has a smaller hash table keyed by source address
- (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
- Every bucket is again a list of "RSVP flows", selected by
- source address and SPI(="Source Port ID" here rather than
- "security parameter index"): triple (key, mask, offset).
-
-
- NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
- and all fragmented packets go to the best-effort traffic class.
-
-
- NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires
- only one "Generalized Port Identifier". So that for classic
- ah, esp (and udp,tcp) both *pi should coincide or one of them
- should be wildcard.
-
- At first sight, this redundancy is just a waste of CPU
- resources. But DPI and SPI add the possibility to assign different
- priorities to GPIs. Look also at note 4 about tunnels below.
-
-
- NOTE 3. One complication is the case of tunneled packets.
- We implement it as following: if the first lookup
- matches a special session with "tunnelhdr" value not zero,
- flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
- In this case, we pull tunnelhdr bytes and restart lookup
- with tunnel ID added to the list of keys. Simple and stupid 8)8)
- It's enough for PIMREG and IPIP.
-
-
- NOTE 4. Two GPIs make it possible to parse even GRE packets.
- F.e. DPI can select ETH_P_IP (and necessary flags to make
- tunnelhdr correct) in GRE protocol field and SPI matches
- GRE key. Is it not nice? 8)8)
-
-
- Well, as result, despite its simplicity, we get a pretty
- powerful classification engine. */
-
-
-struct rsvp_head {
- u32 tmap[256/32];
- u32 hgenerator;
- u8 tgenerator;
- struct rsvp_session __rcu *ht[256];
- struct rcu_head rcu;
-};
-
-struct rsvp_session {
- struct rsvp_session __rcu *next;
- __be32 dst[RSVP_DST_LEN];
- struct tc_rsvp_gpi dpi;
- u8 protocol;
- u8 tunnelid;
- /* 16 (src,sport) hash slots, and one wildcard source slot */
- struct rsvp_filter __rcu *ht[16 + 1];
- struct rcu_head rcu;
-};
-
-
-struct rsvp_filter {
- struct rsvp_filter __rcu *next;
- __be32 src[RSVP_DST_LEN];
- struct tc_rsvp_gpi spi;
- u8 tunnelhdr;
-
- struct tcf_result res;
- struct tcf_exts exts;
-
- u32 handle;
- struct rsvp_session *sess;
- struct rcu_work rwork;
-};
-
-static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
-{
- unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
-
- h ^= h>>16;
- h ^= h>>8;
- return (h ^ protocol ^ tunnelid) & 0xFF;
-}
-
-static inline unsigned int hash_src(__be32 *src)
-{
- unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
-
- h ^= h>>16;
- h ^= h>>8;
- h ^= h>>4;
- return h & 0xF;
-}
-
-#define RSVP_APPLY_RESULT() \
-{ \
- int r = tcf_exts_exec(skb, &f->exts, res); \
- if (r < 0) \
- continue; \
- else if (r > 0) \
- return r; \
-}
-
-TC_INDIRECT_SCOPE int RSVP_CLS(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
-{
- struct rsvp_head *head = rcu_dereference_bh(tp->root);
- struct rsvp_session *s;
- struct rsvp_filter *f;
- unsigned int h1, h2;
- __be32 *dst, *src;
- u8 protocol;
- u8 tunnelid = 0;
- u8 *xprt;
-#if RSVP_DST_LEN == 4
- struct ipv6hdr *nhptr;
-
- if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
- return -1;
- nhptr = ipv6_hdr(skb);
-#else
- struct iphdr *nhptr;
-
- if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
- return -1;
- nhptr = ip_hdr(skb);
-#endif
-restart:
-
-#if RSVP_DST_LEN == 4
- src = &nhptr->saddr.s6_addr32[0];
- dst = &nhptr->daddr.s6_addr32[0];
- protocol = nhptr->nexthdr;
- xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
-#else
- src = &nhptr->saddr;
- dst = &nhptr->daddr;
- protocol = nhptr->protocol;
- xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
- if (ip_is_fragment(nhptr))
- return -1;
-#endif
-
- h1 = hash_dst(dst, protocol, tunnelid);
- h2 = hash_src(src);
-
- for (s = rcu_dereference_bh(head->ht[h1]); s;
- s = rcu_dereference_bh(s->next)) {
- if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
- protocol == s->protocol &&
- !(s->dpi.mask &
- (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
-#if RSVP_DST_LEN == 4
- dst[0] == s->dst[0] &&
- dst[1] == s->dst[1] &&
- dst[2] == s->dst[2] &&
-#endif
- tunnelid == s->tunnelid) {
-
- for (f = rcu_dereference_bh(s->ht[h2]); f;
- f = rcu_dereference_bh(f->next)) {
- if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
- !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
-#if RSVP_DST_LEN == 4
- &&
- src[0] == f->src[0] &&
- src[1] == f->src[1] &&
- src[2] == f->src[2]
-#endif
- ) {
- *res = f->res;
- RSVP_APPLY_RESULT();
-
-matched:
- if (f->tunnelhdr == 0)
- return 0;
-
- tunnelid = f->res.classid;
- nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
- goto restart;
- }
- }
-
- /* And wildcard bucket... */
- for (f = rcu_dereference_bh(s->ht[16]); f;
- f = rcu_dereference_bh(f->next)) {
- *res = f->res;
- RSVP_APPLY_RESULT();
- goto matched;
- }
- return -1;
- }
- }
- return -1;
-}
-
-static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- struct rsvp_session *s;
- struct rsvp_filter __rcu **ins;
- struct rsvp_filter *pins;
- unsigned int h1 = h & 0xFF;
- unsigned int h2 = (h >> 8) & 0xFF;
-
- for (s = rtnl_dereference(head->ht[h1]); s;
- s = rtnl_dereference(s->next)) {
- for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ;
- ins = &pins->next, pins = rtnl_dereference(*ins)) {
- if (pins->handle == h) {
- RCU_INIT_POINTER(n->next, pins->next);
- rcu_assign_pointer(*ins, n);
- return;
- }
- }
- }
-
- /* Something went wrong if we are trying to replace a non-existent
- * node. Might as well halt instead of silently failing.
- */
- BUG_ON(1);
-}
-
-static void *rsvp_get(struct tcf_proto *tp, u32 handle)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- struct rsvp_session *s;
- struct rsvp_filter *f;
- unsigned int h1 = handle & 0xFF;
- unsigned int h2 = (handle >> 8) & 0xFF;
-
- if (h2 > 16)
- return NULL;
-
- for (s = rtnl_dereference(head->ht[h1]); s;
- s = rtnl_dereference(s->next)) {
- for (f = rtnl_dereference(s->ht[h2]); f;
- f = rtnl_dereference(f->next)) {
- if (f->handle == handle)
- return f;
- }
- }
- return NULL;
-}
-
-static int rsvp_init(struct tcf_proto *tp)
-{
- struct rsvp_head *data;
-
- data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
- if (data) {
- rcu_assign_pointer(tp->root, data);
- return 0;
- }
- return -ENOBUFS;
-}
-
-static void __rsvp_delete_filter(struct rsvp_filter *f)
-{
- tcf_exts_destroy(&f->exts);
- tcf_exts_put_net(&f->exts);
- kfree(f);
-}
-
-static void rsvp_delete_filter_work(struct work_struct *work)
-{
- struct rsvp_filter *f = container_of(to_rcu_work(work),
- struct rsvp_filter,
- rwork);
- rtnl_lock();
- __rsvp_delete_filter(f);
- rtnl_unlock();
-}
-
-static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
-{
- tcf_unbind_filter(tp, &f->res);
- /* all classifiers are required to call tcf_exts_destroy() after rcu
- * grace period, since converted-to-rcu actions are relying on that
- * in cleanup() callback
- */
- if (tcf_exts_get_net(&f->exts))
- tcf_queue_work(&f->rwork, rsvp_delete_filter_work);
- else
- __rsvp_delete_filter(f);
-}
-
-static void rsvp_destroy(struct tcf_proto *tp, bool rtnl_held,
- struct netlink_ext_ack *extack)
-{
- struct rsvp_head *data = rtnl_dereference(tp->root);
- int h1, h2;
-
- if (data == NULL)
- return;
-
- for (h1 = 0; h1 < 256; h1++) {
- struct rsvp_session *s;
-
- while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
- RCU_INIT_POINTER(data->ht[h1], s->next);
-
- for (h2 = 0; h2 <= 16; h2++) {
- struct rsvp_filter *f;
-
- while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
- rcu_assign_pointer(s->ht[h2], f->next);
- rsvp_delete_filter(tp, f);
- }
- }
- kfree_rcu(s, rcu);
- }
- }
- kfree_rcu(data, rcu);
-}
-
-static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last,
- bool rtnl_held, struct netlink_ext_ack *extack)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- struct rsvp_filter *nfp, *f = arg;
- struct rsvp_filter __rcu **fp;
- unsigned int h = f->handle;
- struct rsvp_session __rcu **sp;
- struct rsvp_session *nsp, *s = f->sess;
- int i, h1;
-
- fp = &s->ht[(h >> 8) & 0xFF];
- for (nfp = rtnl_dereference(*fp); nfp;
- fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
- if (nfp == f) {
- RCU_INIT_POINTER(*fp, f->next);
- rsvp_delete_filter(tp, f);
-
- /* Strip tree */
-
- for (i = 0; i <= 16; i++)
- if (s->ht[i])
- goto out;
-
- /* OK, session has no flows */
- sp = &head->ht[h & 0xFF];
- for (nsp = rtnl_dereference(*sp); nsp;
- sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
- if (nsp == s) {
- RCU_INIT_POINTER(*sp, s->next);
- kfree_rcu(s, rcu);
- goto out;
- }
- }
-
- break;
- }
- }
-
-out:
- *last = true;
- for (h1 = 0; h1 < 256; h1++) {
- if (rcu_access_pointer(head->ht[h1])) {
- *last = false;
- break;
- }
- }
-
- return 0;
-}
-
-static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
-{
- struct rsvp_head *data = rtnl_dereference(tp->root);
- int i = 0xFFFF;
-
- while (i-- > 0) {
- u32 h;
-
- if ((data->hgenerator += 0x10000) == 0)
- data->hgenerator = 0x10000;
- h = data->hgenerator|salt;
- if (!rsvp_get(tp, h))
- return h;
- }
- return 0;
-}
-
-static int tunnel_bts(struct rsvp_head *data)
-{
- int n = data->tgenerator >> 5;
- u32 b = 1 << (data->tgenerator & 0x1F);
-
- if (data->tmap[n] & b)
- return 0;
- data->tmap[n] |= b;
- return 1;
-}
-
-static void tunnel_recycle(struct rsvp_head *data)
-{
- struct rsvp_session __rcu **sht = data->ht;
- u32 tmap[256/32];
- int h1, h2;
-
- memset(tmap, 0, sizeof(tmap));
-
- for (h1 = 0; h1 < 256; h1++) {
- struct rsvp_session *s;
- for (s = rtnl_dereference(sht[h1]); s;
- s = rtnl_dereference(s->next)) {
- for (h2 = 0; h2 <= 16; h2++) {
- struct rsvp_filter *f;
-
- for (f = rtnl_dereference(s->ht[h2]); f;
- f = rtnl_dereference(f->next)) {
- if (f->tunnelhdr == 0)
- continue;
- data->tgenerator = f->res.classid;
- tunnel_bts(data);
- }
- }
- }
- }
-
- memcpy(data->tmap, tmap, sizeof(tmap));
-}
-
-static u32 gen_tunnel(struct rsvp_head *data)
-{
- int i, k;
-
- for (k = 0; k < 2; k++) {
- for (i = 255; i > 0; i--) {
- if (++data->tgenerator == 0)
- data->tgenerator = 1;
- if (tunnel_bts(data))
- return data->tgenerator;
- }
- tunnel_recycle(data);
- }
- return 0;
-}
-
-static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
- [TCA_RSVP_CLASSID] = { .type = NLA_U32 },
- [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) },
- [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) },
- [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
-};
-
-static int rsvp_change(struct net *net, struct sk_buff *in_skb,
- struct tcf_proto *tp, unsigned long base,
- u32 handle, struct nlattr **tca,
- void **arg, u32 flags,
- struct netlink_ext_ack *extack)
-{
- struct rsvp_head *data = rtnl_dereference(tp->root);
- struct rsvp_filter *f, *nfp;
- struct rsvp_filter __rcu **fp;
- struct rsvp_session *nsp, *s;
- struct rsvp_session __rcu **sp;
- struct tc_rsvp_pinfo *pinfo = NULL;
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_RSVP_MAX + 1];
- struct tcf_exts e;
- unsigned int h1, h2;
- __be32 *dst;
- int err;
-
- if (opt == NULL)
- return handle ? -EINVAL : 0;
-
- err = nla_parse_nested_deprecated(tb, TCA_RSVP_MAX, opt, rsvp_policy,
- NULL);
- if (err < 0)
- return err;
-
- err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE);
- if (err < 0)
- return err;
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags,
- extack);
- if (err < 0)
- goto errout2;
-
- f = *arg;
- if (f) {
- /* Node exists: adjust only classid */
- struct rsvp_filter *n;
-
- if (f->handle != handle && handle)
- goto errout2;
-
- n = kmemdup(f, sizeof(*f), GFP_KERNEL);
- if (!n) {
- err = -ENOMEM;
- goto errout2;
- }
-
- err = tcf_exts_init(&n->exts, net, TCA_RSVP_ACT,
- TCA_RSVP_POLICE);
- if (err < 0) {
- kfree(n);
- goto errout2;
- }
-
- if (tb[TCA_RSVP_CLASSID]) {
- n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
- tcf_bind_filter(tp, &n->res, base);
- }
-
- tcf_exts_change(&n->exts, &e);
- rsvp_replace(tp, n, handle);
- return 0;
- }
-
- /* Now more serious part... */
- err = -EINVAL;
- if (handle)
- goto errout2;
- if (tb[TCA_RSVP_DST] == NULL)
- goto errout2;
-
- err = -ENOBUFS;
- f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
- if (f == NULL)
- goto errout2;
-
- err = tcf_exts_init(&f->exts, net, TCA_RSVP_ACT, TCA_RSVP_POLICE);
- if (err < 0)
- goto errout;
- h2 = 16;
- if (tb[TCA_RSVP_SRC]) {
- memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
- h2 = hash_src(f->src);
- }
- if (tb[TCA_RSVP_PINFO]) {
- pinfo = nla_data(tb[TCA_RSVP_PINFO]);
- f->spi = pinfo->spi;
- f->tunnelhdr = pinfo->tunnelhdr;
- }
- if (tb[TCA_RSVP_CLASSID])
- f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
-
- dst = nla_data(tb[TCA_RSVP_DST]);
- h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
-
- err = -ENOMEM;
- if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
- goto errout;
-
- if (f->tunnelhdr) {
- err = -EINVAL;
- if (f->res.classid > 255)
- goto errout;
-
- err = -ENOMEM;
- if (f->res.classid == 0 &&
- (f->res.classid = gen_tunnel(data)) == 0)
- goto errout;
- }
-
- for (sp = &data->ht[h1];
- (s = rtnl_dereference(*sp)) != NULL;
- sp = &s->next) {
- if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
- pinfo && pinfo->protocol == s->protocol &&
- memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
-#if RSVP_DST_LEN == 4
- dst[0] == s->dst[0] &&
- dst[1] == s->dst[1] &&
- dst[2] == s->dst[2] &&
-#endif
- pinfo->tunnelid == s->tunnelid) {
-
-insert:
- /* OK, we found appropriate session */
-
- fp = &s->ht[h2];
-
- f->sess = s;
- if (f->tunnelhdr == 0)
- tcf_bind_filter(tp, &f->res, base);
-
- tcf_exts_change(&f->exts, &e);
-
- fp = &s->ht[h2];
- for (nfp = rtnl_dereference(*fp); nfp;
- fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
- __u32 mask = nfp->spi.mask & f->spi.mask;
-
- if (mask != f->spi.mask)
- break;
- }
- RCU_INIT_POINTER(f->next, nfp);
- rcu_assign_pointer(*fp, f);
-
- *arg = f;
- return 0;
- }
- }
-
- /* No session found. Create new one. */
-
- err = -ENOBUFS;
- s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
- if (s == NULL)
- goto errout;
- memcpy(s->dst, dst, sizeof(s->dst));
-
- if (pinfo) {
- s->dpi = pinfo->dpi;
- s->protocol = pinfo->protocol;
- s->tunnelid = pinfo->tunnelid;
- }
- sp = &data->ht[h1];
- for (nsp = rtnl_dereference(*sp); nsp;
- sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
- if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
- break;
- }
- RCU_INIT_POINTER(s->next, nsp);
- rcu_assign_pointer(*sp, s);
-
- goto insert;
-
-errout:
- tcf_exts_destroy(&f->exts);
- kfree(f);
-errout2:
- tcf_exts_destroy(&e);
- return err;
-}
-
-static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg,
- bool rtnl_held)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- unsigned int h, h1;
-
- if (arg->stop)
- return;
-
- for (h = 0; h < 256; h++) {
- struct rsvp_session *s;
-
- for (s = rtnl_dereference(head->ht[h]); s;
- s = rtnl_dereference(s->next)) {
- for (h1 = 0; h1 <= 16; h1++) {
- struct rsvp_filter *f;
-
- for (f = rtnl_dereference(s->ht[h1]); f;
- f = rtnl_dereference(f->next)) {
- if (!tc_cls_stats_dump(tp, arg, f))
- return;
- }
- }
- }
- }
-}
-
-static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
-{
- struct rsvp_filter *f = fh;
- struct rsvp_session *s;
- struct nlattr *nest;
- struct tc_rsvp_pinfo pinfo;
-
- if (f == NULL)
- return skb->len;
- s = f->sess;
-
- t->tcm_handle = f->handle;
-
- nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
-
- if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
- goto nla_put_failure;
- pinfo.dpi = s->dpi;
- pinfo.spi = f->spi;
- pinfo.protocol = s->protocol;
- pinfo.tunnelid = s->tunnelid;
- pinfo.tunnelhdr = f->tunnelhdr;
- pinfo.pad = 0;
- if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
- goto nla_put_failure;
- if (f->res.classid &&
- nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
- goto nla_put_failure;
- if (((f->handle >> 8) & 0xFF) != 16 &&
- nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
- goto nla_put_failure;
-
- if (tcf_exts_dump(skb, &f->exts) < 0)
- goto nla_put_failure;
-
- nla_nest_end(skb, nest);
-
- if (tcf_exts_dump_stats(skb, &f->exts) < 0)
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
- unsigned long base)
-{
- struct rsvp_filter *f = fh;
-
- tc_cls_bind_class(classid, cl, q, &f->res, base);
-}
-
-static struct tcf_proto_ops RSVP_OPS __read_mostly = {
- .kind = RSVP_ID,
- .classify = RSVP_CLS,
- .init = rsvp_init,
- .destroy = rsvp_destroy,
- .get = rsvp_get,
- .change = rsvp_change,
- .delete = rsvp_delete,
- .walk = rsvp_walk,
- .dump = rsvp_dump,
- .bind_class = rsvp_bind_class,
- .owner = THIS_MODULE,
-};
-
-static int __init init_rsvp(void)
-{
- return register_tcf_proto_ops(&RSVP_OPS);
-}
-
-static void __exit exit_rsvp(void)
-{
- unregister_tcf_proto_ops(&RSVP_OPS);
-}
-
-module_init(init_rsvp)
-module_exit(exit_rsvp)
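
For reference, the deleted RSVP classifier encoded a filter's position in its two-level hash directly in the handle: the low byte is the session bucket (hash of destination, protocol and tunnel id), the next byte the source bucket with 16 meaning the wildcard slot, and the upper 16 bits come from the hgenerator counter. A small sketch of that packing, mirroring the removed gen_handle()/rsvp_get():

/* Sketch of the removed handle layout; generator is a multiple of 0x10000. */
static inline u32 rsvp_handle_pack(u32 generator, u32 h1, u32 h2)
{
        return generator | h1 | (h2 << 8);
}

static inline void rsvp_handle_unpack(u32 handle, u32 *h1, u32 *h2)
{
        *h1 = handle & 0xFF;            /* session bucket, see hash_dst() */
        *h2 = (handle >> 8) & 0xFF;     /* source bucket, 16 == wildcard */
}
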
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
deleted file mode 100644
index e627cc32d633..000000000000
--- a/net/sched/cls_rsvp6.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/ipv6.h>
-#include <linux/skbuff.h>
-#include <net/act_api.h>
-#include <net/pkt_cls.h>
-#include <net/netlink.h>
-#include <net/tc_wrapper.h>
-
-#define RSVP_DST_LEN 4
-#define RSVP_ID "rsvp6"
-#define RSVP_OPS cls_rsvp6_ops
-#define RSVP_CLS rsvp6_classify
-
-#include "cls_rsvp.h"
-MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
deleted file mode 100644
index 6640e75eaa02..000000000000
--- a/net/sched/cls_tcindex.c
+++ /dev/null
@@ -1,742 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/sched/cls_tcindex.c Packet classifier for skb->tc_index
- *
- * Written 1998,1999 by Werner Almesberger, EPFL ICA
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/refcount.h>
-#include <linux/rcupdate.h>
-#include <net/act_api.h>
-#include <net/netlink.h>
-#include <net/pkt_cls.h>
-#include <net/sch_generic.h>
-#include <net/tc_wrapper.h>
-
-/*
- * Passing parameters to the root seems to be done more awkwardly than really
- * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
- * verified. FIXME.
- */
-
-#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
-#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
-
-
-struct tcindex_data;
-
-struct tcindex_filter_result {
- struct tcf_exts exts;
- struct tcf_result res;
- struct tcindex_data *p;
- struct rcu_work rwork;
-};
-
-struct tcindex_filter {
- u16 key;
- struct tcindex_filter_result result;
- struct tcindex_filter __rcu *next;
- struct rcu_work rwork;
-};
-
-
-struct tcindex_data {
- struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
- struct tcindex_filter __rcu **h; /* imperfect hash; */
- struct tcf_proto *tp;
- u16 mask; /* AND key with mask */
- u32 shift; /* shift ANDed key to the right */
- u32 hash; /* hash table size; 0 if undefined */
- u32 alloc_hash; /* allocated size */
- u32 fall_through; /* 0: only classify if explicit match */
- refcount_t refcnt; /* a temporary refcnt for perfect hash */
- struct rcu_work rwork;
-};
-
-static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
-{
- return tcf_exts_has_actions(&r->exts) || r->res.classid;
-}
-
-static void tcindex_data_get(struct tcindex_data *p)
-{
- refcount_inc(&p->refcnt);
-}
-
-static void tcindex_data_put(struct tcindex_data *p)
-{
- if (refcount_dec_and_test(&p->refcnt)) {
- kfree(p->perfect);
- kfree(p->h);
- kfree(p);
- }
-}
-
-static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
- u16 key)
-{
- if (p->perfect) {
- struct tcindex_filter_result *f = p->perfect + key;
-
- return tcindex_filter_is_set(f) ? f : NULL;
- } else if (p->h) {
- struct tcindex_filter __rcu **fp;
- struct tcindex_filter *f;
-
- fp = &p->h[key % p->hash];
- for (f = rcu_dereference_bh_rtnl(*fp);
- f;
- fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
- if (f->key == key)
- return &f->result;
- }
-
- return NULL;
-}
-
-TC_INDIRECT_SCOPE int tcindex_classify(struct sk_buff *skb,
- const struct tcf_proto *tp,
- struct tcf_result *res)
-{
- struct tcindex_data *p = rcu_dereference_bh(tp->root);
- struct tcindex_filter_result *f;
- int key = (skb->tc_index & p->mask) >> p->shift;
-
- pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
- skb, tp, res, p);
-
- f = tcindex_lookup(p, key);
- if (!f) {
- struct Qdisc *q = tcf_block_q(tp->chain->block);
-
- if (!p->fall_through)
- return -1;
- res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key);
- res->class = 0;
- pr_debug("alg 0x%x\n", res->classid);
- return 0;
- }
- *res = f->res;
- pr_debug("map 0x%x\n", res->classid);
-
- return tcf_exts_exec(skb, &f->exts, res);
-}
-
-
-static void *tcindex_get(struct tcf_proto *tp, u32 handle)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r;
-
- pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
- if (p->perfect && handle >= p->alloc_hash)
- return NULL;
- r = tcindex_lookup(p, handle);
- return r && tcindex_filter_is_set(r) ? r : NULL;
-}
-
-static int tcindex_init(struct tcf_proto *tp)
-{
- struct tcindex_data *p;
-
- pr_debug("tcindex_init(tp %p)\n", tp);
- p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- p->mask = 0xffff;
- p->hash = DEFAULT_HASH_SIZE;
- p->fall_through = 1;
- refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */
-
- rcu_assign_pointer(tp->root, p);
- return 0;
-}
-
-static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
-{
- tcf_exts_destroy(&r->exts);
- tcf_exts_put_net(&r->exts);
- tcindex_data_put(r->p);
-}
-
-static void tcindex_destroy_rexts_work(struct work_struct *work)
-{
- struct tcindex_filter_result *r;
-
- r = container_of(to_rcu_work(work),
- struct tcindex_filter_result,
- rwork);
- rtnl_lock();
- __tcindex_destroy_rexts(r);
- rtnl_unlock();
-}
-
-static void __tcindex_destroy_fexts(struct tcindex_filter *f)
-{
- tcf_exts_destroy(&f->result.exts);
- tcf_exts_put_net(&f->result.exts);
- kfree(f);
-}
-
-static void tcindex_destroy_fexts_work(struct work_struct *work)
-{
- struct tcindex_filter *f = container_of(to_rcu_work(work),
- struct tcindex_filter,
- rwork);
-
- rtnl_lock();
- __tcindex_destroy_fexts(f);
- rtnl_unlock();
-}
-
-static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
- bool rtnl_held, struct netlink_ext_ack *extack)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r = arg;
- struct tcindex_filter __rcu **walk;
- struct tcindex_filter *f = NULL;
-
- pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p);
- if (p->perfect) {
- if (!r->res.class)
- return -ENOENT;
- } else {
- int i;
-
- for (i = 0; i < p->hash; i++) {
- walk = p->h + i;
- for (f = rtnl_dereference(*walk); f;
- walk = &f->next, f = rtnl_dereference(*walk)) {
- if (&f->result == r)
- goto found;
- }
- }
- return -ENOENT;
-
-found:
- rcu_assign_pointer(*walk, rtnl_dereference(f->next));
- }
- tcf_unbind_filter(tp, &r->res);
- /* all classifiers are required to call tcf_exts_destroy() after rcu
- * grace period, since converted-to-rcu actions are relying on that
- * in cleanup() callback
- */
- if (f) {
- if (tcf_exts_get_net(&f->result.exts))
- tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
- else
- __tcindex_destroy_fexts(f);
- } else {
- tcindex_data_get(p);
-
- if (tcf_exts_get_net(&r->exts))
- tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
- else
- __tcindex_destroy_rexts(r);
- }
-
- *last = false;
- return 0;
-}
-
-static void tcindex_destroy_work(struct work_struct *work)
-{
- struct tcindex_data *p = container_of(to_rcu_work(work),
- struct tcindex_data,
- rwork);
-
- tcindex_data_put(p);
-}
-
-static inline int
-valid_perfect_hash(struct tcindex_data *p)
-{
- return p->hash > (p->mask >> p->shift);
-}
-
-static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
- [TCA_TCINDEX_HASH] = { .type = NLA_U32 },
- [TCA_TCINDEX_MASK] = { .type = NLA_U16 },
- [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 },
- [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 },
- [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
-};
-
-static int tcindex_filter_result_init(struct tcindex_filter_result *r,
- struct tcindex_data *p,
- struct net *net)
-{
- memset(r, 0, sizeof(*r));
- r->p = p;
- return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
- TCA_TCINDEX_POLICE);
-}
-
-static void tcindex_free_perfect_hash(struct tcindex_data *cp);
-
-static void tcindex_partial_destroy_work(struct work_struct *work)
-{
- struct tcindex_data *p = container_of(to_rcu_work(work),
- struct tcindex_data,
- rwork);
-
- rtnl_lock();
- if (p->perfect)
- tcindex_free_perfect_hash(p);
- kfree(p);
- rtnl_unlock();
-}
-
-static void tcindex_free_perfect_hash(struct tcindex_data *cp)
-{
- int i;
-
- for (i = 0; i < cp->hash; i++)
- tcf_exts_destroy(&cp->perfect[i].exts);
- kfree(cp->perfect);
-}
-
-static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
-{
- int i, err = 0;
-
- cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result),
- GFP_KERNEL | __GFP_NOWARN);
- if (!cp->perfect)
- return -ENOMEM;
-
- for (i = 0; i < cp->hash; i++) {
- err = tcf_exts_init(&cp->perfect[i].exts, net,
- TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
- if (err < 0)
- goto errout;
- cp->perfect[i].p = cp;
- }
-
- return 0;
-
-errout:
- tcindex_free_perfect_hash(cp);
- return err;
-}
-
-static int
-tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
- u32 handle, struct tcindex_data *p,
- struct tcindex_filter_result *r, struct nlattr **tb,
- struct nlattr *est, u32 flags, struct netlink_ext_ack *extack)
-{
- struct tcindex_filter_result new_filter_result;
- struct tcindex_data *cp = NULL, *oldp;
- struct tcindex_filter *f = NULL; /* make gcc behave */
- struct tcf_result cr = {};
- int err, balloc = 0;
- struct tcf_exts e;
- bool update_h = false;
-
- err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
- if (err < 0)
- return err;
- err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack);
- if (err < 0)
- goto errout;
-
- err = -ENOMEM;
- /* tcindex_data attributes must look atomic to classifier/lookup so
- * allocate new tcindex data and RCU assign it onto root. Keeping
- * perfect hash and hash pointers from old data.
- */
- cp = kzalloc(sizeof(*cp), GFP_KERNEL);
- if (!cp)
- goto errout;
-
- cp->mask = p->mask;
- cp->shift = p->shift;
- cp->hash = p->hash;
- cp->alloc_hash = p->alloc_hash;
- cp->fall_through = p->fall_through;
- cp->tp = tp;
- refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */
-
- if (tb[TCA_TCINDEX_HASH])
- cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
-
- if (tb[TCA_TCINDEX_MASK])
- cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
-
- if (tb[TCA_TCINDEX_SHIFT]) {
- cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
- if (cp->shift > 16) {
- err = -EINVAL;
- goto errout;
- }
- }
- if (!cp->hash) {
- /* Hash not specified, use perfect hash if the upper limit
- * of the hashing index is below the threshold.
- */
- if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
- cp->hash = (cp->mask >> cp->shift) + 1;
- else
- cp->hash = DEFAULT_HASH_SIZE;
- }
-
- if (p->perfect) {
- int i;
-
- if (tcindex_alloc_perfect_hash(net, cp) < 0)
- goto errout;
- cp->alloc_hash = cp->hash;
- for (i = 0; i < min(cp->hash, p->hash); i++)
- cp->perfect[i].res = p->perfect[i].res;
- balloc = 1;
- }
- cp->h = p->h;
-
- err = tcindex_filter_result_init(&new_filter_result, cp, net);
- if (err < 0)
- goto errout_alloc;
- if (r)
- cr = r->res;
-
- err = -EBUSY;
-
- /* Hash already allocated, make sure that we still meet the
- * requirements for the allocated hash.
- */
- if (cp->perfect) {
- if (!valid_perfect_hash(cp) ||
- cp->hash > cp->alloc_hash)
- goto errout_alloc;
- } else if (cp->h && cp->hash != cp->alloc_hash) {
- goto errout_alloc;
- }
-
- err = -EINVAL;
- if (tb[TCA_TCINDEX_FALL_THROUGH])
- cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
-
- if (!cp->perfect && !cp->h)
- cp->alloc_hash = cp->hash;
-
- /* Note: this could be as restrictive as if (handle & ~(mask >> shift))
- * but then, we'd fail handles that may become valid after some future
- * mask change. While this is extremely unlikely to ever matter,
- * the check below is safer (and also more backwards-compatible).
- */
- if (cp->perfect || valid_perfect_hash(cp))
- if (handle >= cp->alloc_hash)
- goto errout_alloc;
-
-
- err = -ENOMEM;
- if (!cp->perfect && !cp->h) {
- if (valid_perfect_hash(cp)) {
- if (tcindex_alloc_perfect_hash(net, cp) < 0)
- goto errout_alloc;
- balloc = 1;
- } else {
- struct tcindex_filter __rcu **hash;
-
- hash = kcalloc(cp->hash,
- sizeof(struct tcindex_filter *),
- GFP_KERNEL);
-
- if (!hash)
- goto errout_alloc;
-
- cp->h = hash;
- balloc = 2;
- }
- }
-
- if (cp->perfect) {
- r = cp->perfect + handle;
- } else {
- /* imperfect area is updated in-place using rcu */
- update_h = !!tcindex_lookup(cp, handle);
- r = &new_filter_result;
- }
-
- if (r == &new_filter_result) {
- f = kzalloc(sizeof(*f), GFP_KERNEL);
- if (!f)
- goto errout_alloc;
- f->key = handle;
- f->next = NULL;
- err = tcindex_filter_result_init(&f->result, cp, net);
- if (err < 0) {
- kfree(f);
- goto errout_alloc;
- }
- }
-
- if (tb[TCA_TCINDEX_CLASSID]) {
- cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
- tcf_bind_filter(tp, &cr, base);
- }
-
- oldp = p;
- r->res = cr;
- tcf_exts_change(&r->exts, &e);
-
- rcu_assign_pointer(tp->root, cp);
-
- if (update_h) {
- struct tcindex_filter __rcu **fp;
- struct tcindex_filter *cf;
-
- f->result.res = r->res;
- tcf_exts_change(&f->result.exts, &r->exts);
-
- /* imperfect area bucket */
- fp = cp->h + (handle % cp->hash);
-
- /* lookup the filter, guaranteed to exist */
- for (cf = rcu_dereference_bh_rtnl(*fp); cf;
- fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp))
- if (cf->key == (u16)handle)
- break;
-
- f->next = cf->next;
-
- cf = rcu_replace_pointer(*fp, f, 1);
- tcf_exts_get_net(&cf->result.exts);
- tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work);
- } else if (r == &new_filter_result) {
- struct tcindex_filter *nfp;
- struct tcindex_filter __rcu **fp;
-
- f->result.res = r->res;
- tcf_exts_change(&f->result.exts, &r->exts);
-
- fp = cp->h + (handle % cp->hash);
- for (nfp = rtnl_dereference(*fp);
- nfp;
- fp = &nfp->next, nfp = rtnl_dereference(*fp))
- ; /* nothing */
-
- rcu_assign_pointer(*fp, f);
- } else {
- tcf_exts_destroy(&new_filter_result.exts);
- }
-
- if (oldp)
- tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work);
- return 0;
-
-errout_alloc:
- if (balloc == 1)
- tcindex_free_perfect_hash(cp);
- else if (balloc == 2)
- kfree(cp->h);
- tcf_exts_destroy(&new_filter_result.exts);
-errout:
- kfree(cp);
- tcf_exts_destroy(&e);
- return err;
-}
-
-static int
-tcindex_change(struct net *net, struct sk_buff *in_skb,
- struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, void **arg, u32 flags,
- struct netlink_ext_ack *extack)
-{
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_TCINDEX_MAX + 1];
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r = *arg;
- int err;
-
- pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
- "p %p,r %p,*arg %p\n",
- tp, handle, tca, arg, opt, p, r, *arg);
-
- if (!opt)
- return 0;
-
- err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt,
- tcindex_policy, NULL);
- if (err < 0)
- return err;
-
- return tcindex_set_parms(net, tp, base, handle, p, r, tb,
- tca[TCA_RATE], flags, extack);
-}
-
-static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker,
- bool rtnl_held)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter *f, *next;
- int i;
-
- pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
- if (p->perfect) {
- for (i = 0; i < p->hash; i++) {
- if (!p->perfect[i].res.class)
- continue;
- if (!tc_cls_stats_dump(tp, walker, p->perfect + i))
- return;
- }
- }
- if (!p->h)
- return;
- for (i = 0; i < p->hash; i++) {
- for (f = rtnl_dereference(p->h[i]); f; f = next) {
- next = rtnl_dereference(f->next);
- if (!tc_cls_stats_dump(tp, walker, &f->result))
- return;
- }
- }
-}
-
-static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
- struct netlink_ext_ack *extack)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- int i;
-
- pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
-
- if (p->perfect) {
- for (i = 0; i < p->hash; i++) {
- struct tcindex_filter_result *r = p->perfect + i;
-
- /* tcf_queue_work() does not guarantee the ordering we
- * want, so we have to take this refcnt temporarily to
- * ensure 'p' is freed after all tcindex_filter_result
- * here. Imperfect hash does not need this, because it
- * uses linked lists rather than an array.
- */
- tcindex_data_get(p);
-
- tcf_unbind_filter(tp, &r->res);
- if (tcf_exts_get_net(&r->exts))
- tcf_queue_work(&r->rwork,
- tcindex_destroy_rexts_work);
- else
- __tcindex_destroy_rexts(r);
- }
- }
-
- for (i = 0; p->h && i < p->hash; i++) {
- struct tcindex_filter *f, *next;
- bool last;
-
- for (f = rtnl_dereference(p->h[i]); f; f = next) {
- next = rtnl_dereference(f->next);
- tcindex_delete(tp, &f->result, &last, rtnl_held, NULL);
- }
- }
-
- tcf_queue_work(&p->rwork, tcindex_destroy_work);
-}
-
-
-static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r = fh;
- struct nlattr *nest;
-
- pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n",
- tp, fh, skb, t, p, r);
- pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
-
- nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
-
- if (!fh) {
- t->tcm_handle = ~0; /* whatever ... */
- if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
- nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
- nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
- nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
- goto nla_put_failure;
- nla_nest_end(skb, nest);
- } else {
- if (p->perfect) {
- t->tcm_handle = r - p->perfect;
- } else {
- struct tcindex_filter *f;
- struct tcindex_filter __rcu **fp;
- int i;
-
- t->tcm_handle = 0;
- for (i = 0; !t->tcm_handle && i < p->hash; i++) {
- fp = &p->h[i];
- for (f = rtnl_dereference(*fp);
- !t->tcm_handle && f;
- fp = &f->next, f = rtnl_dereference(*fp)) {
- if (&f->result == r)
- t->tcm_handle = f->key;
- }
- }
- }
- pr_debug("handle = %d\n", t->tcm_handle);
- if (r->res.class &&
- nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
- goto nla_put_failure;
-
- if (tcf_exts_dump(skb, &r->exts) < 0)
- goto nla_put_failure;
- nla_nest_end(skb, nest);
-
- if (tcf_exts_dump_stats(skb, &r->exts) < 0)
- goto nla_put_failure;
- }
-
- return skb->len;
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl,
- void *q, unsigned long base)
-{
- struct tcindex_filter_result *r = fh;
-
- tc_cls_bind_class(classid, cl, q, &r->res, base);
-}
-
-static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
- .kind = "tcindex",
- .classify = tcindex_classify,
- .init = tcindex_init,
- .destroy = tcindex_destroy,
- .get = tcindex_get,
- .change = tcindex_change,
- .delete = tcindex_delete,
- .walk = tcindex_walk,
- .dump = tcindex_dump,
- .bind_class = tcindex_bind_class,
- .owner = THIS_MODULE,
-};
-
-static int __init init_tcindex(void)
-{
- return register_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-static void __exit exit_tcindex(void)
-{
- unregister_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-module_init(init_tcindex)
-module_exit(exit_tcindex)
-MODULE_LICENSE("GPL");
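
The deleted tcindex classifier keyed purely on skb->tc_index: key = (tc_index & mask) >> shift, with an optional fall-through to TC_H_MAKE(qdisc major, key) when no filter matched. A tiny worked example of the key computation; the numbers are illustrative only:

/* Example: tc_index = 0x0123, mask = 0x00f0, shift = 4
 *   (0x0123 & 0x00f0) >> 4 = 0x0020 >> 4 = 2
 */
static inline u16 example_tcindex_key(u16 tc_index, u16 mask, u32 shift)
{
        return (tc_index & mask) >> shift;
}
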
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 72d2c204d5f3..aba789c30a2e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -902,7 +902,8 @@ static void qdisc_offload_graft_root(struct net_device *dev,
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
- u32 portid, u32 seq, u16 flags, int event)
+ u32 portid, u32 seq, u16 flags, int event,
+ struct netlink_ext_ack *extack)
{
struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
struct gnet_stats_queue __percpu *cpu_qstats = NULL;
@@ -970,7 +971,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_finish_copy(&d) < 0)
goto nla_put_failure;
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -991,7 +997,8 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, u32 clid,
- struct Qdisc *old, struct Qdisc *new)
+ struct Qdisc *old, struct Qdisc *new,
+ struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -1002,12 +1009,12 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb,
if (old && !tc_qdisc_dump_ignore(old, false)) {
if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
- 0, RTM_DELQDISC) < 0)
+ 0, RTM_DELQDISC, extack) < 0)
goto err_out;
}
if (new && !tc_qdisc_dump_ignore(new, false)) {
if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
- old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
goto err_out;
}
@@ -1022,10 +1029,11 @@ err_out:
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
struct nlmsghdr *n, u32 clid,
- struct Qdisc *old, struct Qdisc *new)
+ struct Qdisc *old, struct Qdisc *new,
+ struct netlink_ext_ack *extack)
{
if (new || old)
- qdisc_notify(net, skb, n, clid, old, new);
+ qdisc_notify(net, skb, n, clid, old, new, extack);
if (old)
qdisc_put(old);
@@ -1105,12 +1113,12 @@ skip:
qdisc_refcount_inc(new);
rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
- notify_and_destroy(net, skb, n, classid, old, new);
+ notify_and_destroy(net, skb, n, classid, old, new, extack);
if (new && new->ops->attach)
new->ops->attach(new);
} else {
- notify_and_destroy(net, skb, n, classid, old, new);
+ notify_and_destroy(net, skb, n, classid, old, new, extack);
}
if (dev->flags & IFF_UP)
@@ -1141,7 +1149,7 @@ skip:
err = cops->graft(parent, cl, new, &old, extack);
if (err)
return err;
- notify_and_destroy(net, skb, n, classid, old, new);
+ notify_and_destroy(net, skb, n, classid, old, new, extack);
}
return 0;
}
@@ -1274,20 +1282,21 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (err)
goto err_out3;
- if (ops->init) {
- err = ops->init(sch, tca[TCA_OPTIONS], extack);
- if (err != 0)
- goto err_out5;
- }
-
if (tca[TCA_STAB]) {
stab = qdisc_get_stab(tca[TCA_STAB], extack);
if (IS_ERR(stab)) {
err = PTR_ERR(stab);
- goto err_out4;
+ goto err_out3;
}
rcu_assign_pointer(sch->stab, stab);
}
+
+ if (ops->init) {
+ err = ops->init(sch, tca[TCA_OPTIONS], extack);
+ if (err != 0)
+ goto err_out4;
+ }
+
if (tca[TCA_RATE]) {
err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT) {
@@ -1312,10 +1321,13 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
return sch;
-err_out5:
- /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
+err_out4:
+ /* Even if ops->init() failed, we call ops->destroy()
+ * like qdisc_create_dflt().
+ */
if (ops->destroy)
ops->destroy(sch);
+ qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
netdev_put(dev, &sch->dev_tracker);
qdisc_free(sch);
@@ -1324,16 +1336,6 @@ err_out2:
err_out:
*errp = err;
return NULL;
-
-err_out4:
- /*
- * Any broken qdiscs that would require a ops->reset() here?
- * The qdisc was never in action so it shouldn't be necessary.
- */
- qdisc_put_stab(rtnl_dereference(sch->stab));
- if (ops->destroy)
- ops->destroy(sch);
- goto err_out3;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
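
qdisc_create() now attaches the TCA_STAB size table before calling ops->init(), and a failing init tears down in one place: ops->destroy() followed by qdisc_put_stab(), then the common netdev_put()/qdisc_free() path; the separate err_out5/err_out4 labels of the old layout collapse into err_out4/err_out3. A condensed sketch of the resulting order, using only what the hunks above show:

        /* New order in qdisc_create() (sketch):
         *   stab = qdisc_get_stab(tca[TCA_STAB], extack);  may fail -> err_out3
         *   rcu_assign_pointer(sch->stab, stab);
         *   ops->init(sch, tca[TCA_OPTIONS], extack);       may fail -> err_out4
         * err_out4:
         *   ops->destroy(sch);
         *   qdisc_put_stab(rtnl_dereference(sch->stab));
         * err_out3:
         *   netdev_put(dev, &sch->dev_tracker);
         *   qdisc_free(sch);
         */
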
@@ -1509,7 +1511,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
if (err != 0)
return err;
} else {
- qdisc_notify(net, skb, n, clid, NULL, q);
+ qdisc_notify(net, skb, n, clid, NULL, q, NULL);
}
return 0;
}
@@ -1648,7 +1650,7 @@ replay:
}
err = qdisc_change(q, tca, extack);
if (err == 0)
- qdisc_notify(net, skb, n, clid, NULL, q);
+ qdisc_notify(net, skb, n, clid, NULL, q, extack);
return err;
create_n_graft:
@@ -1715,7 +1717,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWQDISC) <= 0)
+ RTM_NEWQDISC, NULL) <= 0)
goto done;
q_idx++;
}
@@ -1737,7 +1739,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWQDISC) <= 0)
+ RTM_NEWQDISC, NULL) <= 0)
goto done;
q_idx++;
}
@@ -1810,8 +1812,8 @@ done:
************************************************/
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
- unsigned long cl,
- u32 portid, u32 seq, u16 flags, int event)
+ unsigned long cl, u32 portid, u32 seq, u16 flags,
+ int event, struct netlink_ext_ack *extack)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -1846,7 +1848,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
if (gnet_stats_finish_copy(&d) < 0)
goto nla_put_failure;
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -1857,7 +1864,7 @@ nla_put_failure:
static int tclass_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct Qdisc *q,
- unsigned long cl, int event)
+ unsigned long cl, int event, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -1866,7 +1873,7 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
if (!skb)
return -ENOBUFS;
- if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1893,7 +1900,7 @@ static int tclass_del_notify(struct net *net,
return -ENOBUFS;
if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
- RTM_DELTCLASS) < 0) {
+ RTM_DELTCLASS, extack) < 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -2100,7 +2107,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
tc_bind_tclass(q, portid, clid, 0);
goto out;
case RTM_GETTCLASS:
- err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
+ err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
goto out;
default:
err = -EINVAL;
@@ -2118,7 +2125,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
if (cops->change)
err = cops->change(q, clid, portid, tca, &new_cl, extack);
if (err == 0) {
- tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
+ tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
/* We just create a new class, need to do reverse binding. */
if (cl != new_cl)
tc_bind_tclass(q, portid, clid, new_cl);
@@ -2140,7 +2147,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTCLASS);
+ RTM_NEWTCLASS, NULL);
}
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
deleted file mode 100644
index 4a981ca90b0b..000000000000
--- a/net/sched/sch_atm.c
+++ /dev/null
@@ -1,706 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
-
-/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/atmdev.h>
-#include <linux/atmclip.h>
-#include <linux/rtnetlink.h>
-#include <linux/file.h> /* for fput */
-#include <net/netlink.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-
-/*
- * The ATM queuing discipline provides a framework for invoking classifiers
- * (aka "filters"), which in turn select classes of this queuing discipline.
- * Each class maps the flow(s) it is handling to a given VC. Multiple classes
- * may share the same VC.
- *
- * When creating a class, VCs are specified by passing the number of the open
- * socket descriptor by which the calling process references the VC. The kernel
- * keeps the VC open at least until all classes using it are removed.
- *
- * In this file, most functions are named atm_tc_* to avoid confusion with all
- * the atm_* in net/atm. This naming convention differs from what's used in the
- * rest of net/sched.
- *
- * Known bugs:
- * - sometimes messes up the IP stack
- * - any manipulations besides the few operations described in the README are
- * untested and likely to crash the system
- * - should lock the flow while there is data in the queue (?)
- */
-
-#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
-
-struct atm_flow_data {
- struct Qdisc_class_common common;
- struct Qdisc *q; /* FIFO, TBF, etc. */
- struct tcf_proto __rcu *filter_list;
- struct tcf_block *block;
- struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
- void (*old_pop)(struct atm_vcc *vcc,
- struct sk_buff *skb); /* chaining */
- struct atm_qdisc_data *parent; /* parent qdisc */
- struct socket *sock; /* for closing */
- int ref; /* reference count */
- struct gnet_stats_basic_sync bstats;
- struct gnet_stats_queue qstats;
- struct list_head list;
- struct atm_flow_data *excess; /* flow for excess traffic;
- NULL to set CLP instead */
- int hdr_len;
- unsigned char hdr[]; /* header data; MUST BE LAST */
-};
-
-struct atm_qdisc_data {
- struct atm_flow_data link; /* unclassified skbs go here */
- struct list_head flows; /* NB: "link" is also on this
- list */
- struct tasklet_struct task; /* dequeue tasklet */
-};
-
-/* ------------------------- Class/flow operations ------------------------- */
-
-static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- list_for_each_entry(flow, &p->flows, list) {
- if (flow->common.classid == classid)
- return flow;
- }
- return NULL;
-}
-
-static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
- struct Qdisc *new, struct Qdisc **old,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
- pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
- sch, p, flow, new, old);
- if (list_empty(&flow->list))
- return -EINVAL;
- if (!new)
- new = &noop_qdisc;
- *old = flow->q;
- flow->q = new;
- if (*old)
- qdisc_reset(*old);
- return 0;
-}
-
-static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
-{
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
- pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
- return flow ? flow->q : NULL;
-}
-
-static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid)
-{
- struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
- flow = lookup_flow(sch, classid);
- pr_debug("%s: flow %p\n", __func__, flow);
- return (unsigned long)flow;
-}
-
-static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
- unsigned long parent, u32 classid)
-{
- struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
- flow = lookup_flow(sch, classid);
- if (flow)
- flow->ref++;
- pr_debug("%s: flow %p\n", __func__, flow);
- return (unsigned long)flow;
-}
-
-/*
- * atm_tc_put handles all destructions, including the ones that are explicitly
- * requested (atm_tc_destroy, etc.). The assumption here is that we never drop
- * anything that still seems to be in use.
- */
-static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
- pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
- if (--flow->ref)
- return;
- pr_debug("atm_tc_put: destroying\n");
- list_del_init(&flow->list);
- pr_debug("atm_tc_put: qdisc %p\n", flow->q);
- qdisc_put(flow->q);
- tcf_block_put(flow->block);
- if (flow->sock) {
- pr_debug("atm_tc_put: f_count %ld\n",
- file_count(flow->sock->file));
- flow->vcc->pop = flow->old_pop;
- sockfd_put(flow->sock);
- }
- if (flow->excess)
- atm_tc_put(sch, (unsigned long)flow->excess);
- if (flow != &p->link)
- kfree(flow);
- /*
- * If flow == &p->link, the qdisc no longer works at this point and
- * needs to be removed. (By the caller of atm_tc_put.)
- */
-}
-
-static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
-{
- struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
-
- pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
- VCC2FLOW(vcc)->old_pop(vcc, skb);
- tasklet_schedule(&p->task);
-}
-
-static const u8 llc_oui_ip[] = {
- 0xaa, /* DSAP: non-ISO */
- 0xaa, /* SSAP: non-ISO */
- 0x03, /* Ctrl: Unnumbered Information Command PDU */
- 0x00, /* OUI: EtherType */
- 0x00, 0x00,
- 0x08, 0x00
-}; /* Ethertype IP (0800) */
-
-static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
- [TCA_ATM_FD] = { .type = NLA_U32 },
- [TCA_ATM_EXCESS] = { .type = NLA_U32 },
-};
-
-static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
- struct nlattr **tca, unsigned long *arg,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
- struct atm_flow_data *excess = NULL;
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_ATM_MAX + 1];
- struct socket *sock;
- int fd, error, hdr_len;
- void *hdr;
-
- pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
- "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
- /*
- * The concept of parents doesn't apply for this qdisc.
- */
- if (parent && parent != TC_H_ROOT && parent != sch->handle)
- return -EINVAL;
- /*
- * ATM classes cannot be changed. In order to change properties of the
- * ATM connection, that socket needs to be modified directly (via the
- * native ATM API). In order to send a flow to a different VC, the old
- * class needs to be removed and a new one added. (This may be changed
- * later.)
- */
- if (flow)
- return -EBUSY;
- if (opt == NULL)
- return -EINVAL;
-
- error = nla_parse_nested_deprecated(tb, TCA_ATM_MAX, opt, atm_policy,
- NULL);
- if (error < 0)
- return error;
-
- if (!tb[TCA_ATM_FD])
- return -EINVAL;
- fd = nla_get_u32(tb[TCA_ATM_FD]);
- pr_debug("atm_tc_change: fd %d\n", fd);
- if (tb[TCA_ATM_HDR]) {
- hdr_len = nla_len(tb[TCA_ATM_HDR]);
- hdr = nla_data(tb[TCA_ATM_HDR]);
- } else {
- hdr_len = RFC1483LLC_LEN;
- hdr = NULL; /* default LLC/SNAP for IP */
- }
- if (!tb[TCA_ATM_EXCESS])
- excess = NULL;
- else {
- excess = (struct atm_flow_data *)
- atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
- if (!excess)
- return -ENOENT;
- }
- pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
- opt->nla_type, nla_len(opt), hdr_len);
- sock = sockfd_lookup(fd, &error);
- if (!sock)
- return error; /* f_count++ */
- pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
- if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
- error = -EPROTOTYPE;
- goto err_out;
- }
- /* @@@ should check if the socket is really operational or we'll crash
- on vcc->send */
- if (classid) {
- if (TC_H_MAJ(classid ^ sch->handle)) {
- pr_debug("atm_tc_change: classid mismatch\n");
- error = -EINVAL;
- goto err_out;
- }
- } else {
- int i;
- unsigned long cl;
-
- for (i = 1; i < 0x8000; i++) {
- classid = TC_H_MAKE(sch->handle, 0x8000 | i);
- cl = atm_tc_find(sch, classid);
- if (!cl)
- break;
- }
- }
- pr_debug("atm_tc_change: new id %x\n", classid);
- flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
- pr_debug("atm_tc_change: flow %p\n", flow);
- if (!flow) {
- error = -ENOBUFS;
- goto err_out;
- }
-
- error = tcf_block_get(&flow->block, &flow->filter_list, sch,
- extack);
- if (error) {
- kfree(flow);
- goto err_out;
- }
-
- flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
- extack);
- if (!flow->q)
- flow->q = &noop_qdisc;
- pr_debug("atm_tc_change: qdisc %p\n", flow->q);
- flow->sock = sock;
- flow->vcc = ATM_SD(sock); /* speedup */
- flow->vcc->user_back = flow;
- pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
- flow->old_pop = flow->vcc->pop;
- flow->parent = p;
- flow->vcc->pop = sch_atm_pop;
- flow->common.classid = classid;
- flow->ref = 1;
- flow->excess = excess;
- list_add(&flow->list, &p->link.list);
- flow->hdr_len = hdr_len;
- if (hdr)
- memcpy(flow->hdr, hdr, hdr_len);
- else
- memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
- *arg = (unsigned long)flow;
- return 0;
-err_out:
- sockfd_put(sock);
- return error;
-}
-
-static int atm_tc_delete(struct Qdisc *sch, unsigned long arg,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
- pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
- if (list_empty(&flow->list))
- return -EINVAL;
- if (rcu_access_pointer(flow->filter_list) || flow == &p->link)
- return -EBUSY;
- /*
- * Reference count must be 2: one for "keepalive" (set at class
- * creation), and one for the reference held when calling delete.
- */
- if (flow->ref < 2) {
- pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
- return -EINVAL;
- }
- if (flow->ref > 2)
- return -EBUSY; /* catch references via excess, etc. */
- atm_tc_put(sch, arg);
- return 0;
-}
-
-static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
- if (walker->stop)
- return;
- list_for_each_entry(flow, &p->flows, list) {
- if (!tc_qdisc_stats_dump(sch, (unsigned long)flow, walker))
- break;
- }
-}
-
-static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
- pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
- return flow ? flow->block : p->link.block;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
-
-static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff **to_free)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
- struct tcf_result res;
- int result;
- int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-
- pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
- result = TC_ACT_OK; /* be nice to gcc */
- flow = NULL;
- if (TC_H_MAJ(skb->priority) != sch->handle ||
- !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) {
- struct tcf_proto *fl;
-
- list_for_each_entry(flow, &p->flows, list) {
- fl = rcu_dereference_bh(flow->filter_list);
- if (fl) {
- result = tcf_classify(skb, NULL, fl, &res, true);
- if (result < 0)
- continue;
- if (result == TC_ACT_SHOT)
- goto done;
-
- flow = (struct atm_flow_data *)res.class;
- if (!flow)
- flow = lookup_flow(sch, res.classid);
- goto drop;
- }
- }
- flow = NULL;
-done:
- ;
- }
- if (!flow) {
- flow = &p->link;
- } else {
- if (flow->vcc)
- ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
- /*@@@ looks good ... but it's not supposed to work :-) */
-#ifdef CONFIG_NET_CLS_ACT
- switch (result) {
- case TC_ACT_QUEUED:
- case TC_ACT_STOLEN:
- case TC_ACT_TRAP:
- __qdisc_drop(skb, to_free);
- return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- case TC_ACT_SHOT:
- __qdisc_drop(skb, to_free);
- goto drop;
- case TC_ACT_RECLASSIFY:
- if (flow->excess)
- flow = flow->excess;
- else
- ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
- break;
- }
-#endif
- }
-
- ret = qdisc_enqueue(skb, flow->q, to_free);
- if (ret != NET_XMIT_SUCCESS) {
-drop: __maybe_unused
- if (net_xmit_drop_count(ret)) {
- qdisc_qstats_drop(sch);
- if (flow)
- flow->qstats.drops++;
- }
- return ret;
- }
- /*
- * Okay, this may seem weird. We pretend we've dropped the packet if
- * it goes via ATM. The reason for this is that the outer qdisc
- * expects to be able to q->dequeue the packet later on if we return
- * success at this place. Also, sch->q.qlen needs to reflect whether
- * there is a packet eligible for dequeuing or not. Note that the
- * statistics of the outer qdisc are necessarily wrong because of all
- * this. There's currently no correct solution for this.
- */
- if (flow == &p->link) {
- sch->q.qlen++;
- return NET_XMIT_SUCCESS;
- }
- tasklet_schedule(&p->task);
- return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-}
-
-/*
- * Dequeue packets and send them over ATM. Note that we quite deliberately
- * avoid checking net_device's flow control here, simply because sch_atm
- * uses its own channels, which have nothing to do with any CLIP, LANE, or
- * other non-ATM interfaces.
- */
-
-static void sch_atm_dequeue(struct tasklet_struct *t)
-{
- struct atm_qdisc_data *p = from_tasklet(p, t, task);
- struct Qdisc *sch = qdisc_from_priv(p);
- struct atm_flow_data *flow;
- struct sk_buff *skb;
-
- pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list) {
- if (flow == &p->link)
- continue;
- /*
- * If traffic is properly shaped, this won't generate nasty
- * little bursts. Otherwise, it may ... (but that's okay)
- */
- while ((skb = flow->q->ops->peek(flow->q))) {
- if (!atm_may_send(flow->vcc, skb->truesize))
- break;
-
- skb = qdisc_dequeue_peeked(flow->q);
- if (unlikely(!skb))
- break;
-
- qdisc_bstats_update(sch, skb);
- bstats_update(&flow->bstats, skb);
- pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
- /* remove any LL header somebody else has attached */
- skb_pull(skb, skb_network_offset(skb));
- if (skb_headroom(skb) < flow->hdr_len) {
- struct sk_buff *new;
-
- new = skb_realloc_headroom(skb, flow->hdr_len);
- dev_kfree_skb(skb);
- if (!new)
- continue;
- skb = new;
- }
- pr_debug("sch_atm_dequeue: ip %p, data %p\n",
- skb_network_header(skb), skb->data);
- ATM_SKB(skb)->vcc = flow->vcc;
- memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
- flow->hdr_len);
- refcount_add(skb->truesize,
- &sk_atm(flow->vcc)->sk_wmem_alloc);
- /* atm.atm_options are already set by atm_tc_enqueue */
- flow->vcc->send(flow->vcc, skb);
- }
- }
-}
-
-static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct sk_buff *skb;
-
- pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
- tasklet_schedule(&p->task);
- skb = qdisc_dequeue_peeked(p->link.q);
- if (skb)
- sch->q.qlen--;
- return skb;
-}
-
-static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
-
- return p->link.q->ops->peek(p->link.q);
-}
-
-static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- int err;
-
- pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
- INIT_LIST_HEAD(&p->flows);
- INIT_LIST_HEAD(&p->link.list);
- gnet_stats_basic_sync_init(&p->link.bstats);
- list_add(&p->link.list, &p->flows);
- p->link.q = qdisc_create_dflt(sch->dev_queue,
- &pfifo_qdisc_ops, sch->handle, extack);
- if (!p->link.q)
- p->link.q = &noop_qdisc;
- pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
- p->link.vcc = NULL;
- p->link.sock = NULL;
- p->link.common.classid = sch->handle;
- p->link.ref = 1;
-
- err = tcf_block_get(&p->link.block, &p->link.filter_list, sch,
- extack);
- if (err)
- return err;
-
- tasklet_setup(&p->task, sch_atm_dequeue);
- return 0;
-}
-
-static void atm_tc_reset(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list)
- qdisc_reset(flow->q);
-}
-
-static void atm_tc_destroy(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow, *tmp;
-
- pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list) {
- tcf_block_put(flow->block);
- flow->block = NULL;
- }
-
- list_for_each_entry_safe(flow, tmp, &p->flows, list) {
- if (flow->ref > 1)
- pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
- atm_tc_put(sch, (unsigned long)flow);
- }
- tasklet_kill(&p->task);
-}
-
-static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
- struct sk_buff *skb, struct tcmsg *tcm)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
- struct nlattr *nest;
-
- pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
- sch, p, flow, skb, tcm);
- if (list_empty(&flow->list))
- return -EINVAL;
- tcm->tcm_handle = flow->common.classid;
- tcm->tcm_info = flow->q->handle;
-
- nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
-
- if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
- goto nla_put_failure;
- if (flow->vcc) {
- struct sockaddr_atmpvc pvc;
- int state;
-
- memset(&pvc, 0, sizeof(pvc));
- pvc.sap_family = AF_ATMPVC;
- pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
- pvc.sap_addr.vpi = flow->vcc->vpi;
- pvc.sap_addr.vci = flow->vcc->vci;
- if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
- goto nla_put_failure;
- state = ATM_VF2VS(flow->vcc->flags);
- if (nla_put_u32(skb, TCA_ATM_STATE, state))
- goto nla_put_failure;
- }
- if (flow->excess) {
- if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->common.classid))
- goto nla_put_failure;
- } else {
- if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
- goto nla_put_failure;
- }
- return nla_nest_end(skb, nest);
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-static int
-atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
- struct gnet_dump *d)
-{
- struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
- if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 ||
- gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
- return -1;
-
- return 0;
-}
-
-static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- return 0;
-}
-
-static const struct Qdisc_class_ops atm_class_ops = {
- .graft = atm_tc_graft,
- .leaf = atm_tc_leaf,
- .find = atm_tc_find,
- .change = atm_tc_change,
- .delete = atm_tc_delete,
- .walk = atm_tc_walk,
- .tcf_block = atm_tc_tcf_block,
- .bind_tcf = atm_tc_bind_filter,
- .unbind_tcf = atm_tc_put,
- .dump = atm_tc_dump_class,
- .dump_stats = atm_tc_dump_class_stats,
-};
-
-static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
- .cl_ops = &atm_class_ops,
- .id = "atm",
- .priv_size = sizeof(struct atm_qdisc_data),
- .enqueue = atm_tc_enqueue,
- .dequeue = atm_tc_dequeue,
- .peek = atm_tc_peek,
- .init = atm_tc_init,
- .reset = atm_tc_reset,
- .destroy = atm_tc_destroy,
- .dump = atm_tc_dump,
- .owner = THIS_MODULE,
-};
-
-static int __init atm_init(void)
-{
- return register_qdisc(&atm_qdisc_ops);
-}
-
-static void __exit atm_exit(void)
-{
- unregister_qdisc(&atm_qdisc_ops);
-}
-
-module_init(atm_init)
-module_exit(atm_exit)
-MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 3ed0c3342189..7970217b565a 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1209,7 +1209,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
iph_check->daddr != iph->daddr)
continue;
- seglen = ntohs(iph_check->tot_len) -
+ seglen = iph_totlen(skb, iph_check) -
(4 * iph_check->ihl);
} else if (iph_check->version == 6) {
ipv6h = (struct ipv6hdr *)iph;
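The ACK filter previously read the IPv4 length straight from tot_len; with IPv4 BIG TCP, oversized local GSO packets may carry tot_len == 0 once the real length no longer fits in 16 bits, so the hunk switches to the iph_totlen() helper. The sketch below only shows the assumed fallback behaviour (hypothetical name; the real helper lives in include/linux/ip.h and may differ in detail):

/* Assumed behaviour: use the on-wire total length when non-zero, otherwise
 * (oversized GSO packet, tot_len == 0) derive the length from the skb.
 */
static unsigned int ipv4_totlen_sketch(const struct sk_buff *skb,
				       const struct iphdr *iph)
{
	unsigned int len = ntohs(iph->tot_len);

	return len ? len : skb->len - skb_network_offset(skb);
}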
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
deleted file mode 100644
index 36db5f6782f2..000000000000
--- a/net/sched/sch_cbq.c
+++ /dev/null
@@ -1,1727 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/sched/sch_cbq.c Class-Based Queueing discipline.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <net/netlink.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-
-
-/* Class-Based Queueing (CBQ) algorithm.
- =======================================
-
- Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
- Management Models for Packet Networks",
- IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
-
- [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
-
- [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
- Parameters", 1996
-
- [4] Sally Floyd and Michael Speer, "Experimental Results
- for Class-Based Queueing", 1998, not published.
-
- -----------------------------------------------------------------------
-
- Algorithm skeleton was taken from NS simulator cbq.cc.
- If someone wants to check this code against the LBL version,
- he should take into account that ONLY the skeleton was borrowed,
- the implementation is different. Particularly:
-
- --- The WRR algorithm is different. Our version looks more
- reasonable (I hope) and works when quanta are allowed to be
- less than MTU, which is always the case when real time classes
- have small rates. Note, that the statement of [3] is
- incomplete, delay may actually be estimated even if class
- per-round allotment is less than MTU. Namely, if per-round
- allotment is W*r_i, and r_1+...+r_k = r < 1
-
- delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
-
- In the worst case we have IntServ estimate with D = W*r+k*MTU
- and C = MTU*r. The proof (if correct at all) is trivial.
-
-
- --- It seems that cbq-2.0 is not very accurate. At least, I cannot
- interpret some places, which look like wrong translations
- from NS. Anyone is advised to find these differences
- and explain to me, why I am wrong 8).
-
- --- Linux has no EOI event, so that we cannot estimate true class
- idle time. The workaround is to consider the next dequeue event
- as a sign that the previous packet is finished. This is wrong because of
- internal device queueing, but on a permanently loaded link it is true.
- Moreover, combined with clock integrator, this scheme looks
- very close to an ideal solution. */
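One illustrative substitution into the bound above (made-up numbers; B is read as the link bandwidth in bytes per second and [x] as rounding up): with k = 2, r_1 = 0.3, r_2 = 0.2 (so r = 0.5), W = 20000 bytes, MTU = 1500 bytes and B = 1.25e6 bytes/s (10 Mbit/s),

	delay_1 <= ([1500/6000]*10000 + 10000 + 2*1500)/1.25e6
	         = (10000 + 10000 + 3000)/1.25e6 ~= 18.4 ms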
-
-struct cbq_sched_data;
-
-
-struct cbq_class {
- struct Qdisc_class_common common;
- struct cbq_class *next_alive; /* next class with backlog in this priority band */
-
-/* Parameters */
- unsigned char priority; /* class priority */
- unsigned char priority2; /* priority to be used after overlimit */
- unsigned char ewma_log; /* time constant for idle time calculation */
-
- u32 defmap;
-
- /* Link-sharing scheduler parameters */
- long maxidle; /* Class parameters: see below. */
- long offtime;
- long minidle;
- u32 avpkt;
- struct qdisc_rate_table *R_tab;
-
- /* General scheduler (WRR) parameters */
- long allot;
- long quantum; /* Allotment per WRR round */
- long weight; /* Relative allotment: see below */
-
- struct Qdisc *qdisc; /* Ptr to CBQ discipline */
- struct cbq_class *split; /* Ptr to split node */
- struct cbq_class *share; /* Ptr to LS parent in the class tree */
- struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
- struct cbq_class *borrow; /* NULL if class is bandwidth limited;
- parent otherwise */
- struct cbq_class *sibling; /* Sibling chain */
- struct cbq_class *children; /* Pointer to children chain */
-
- struct Qdisc *q; /* Elementary queueing discipline */
-
-
-/* Variables */
- unsigned char cpriority; /* Effective priority */
- unsigned char delayed;
- unsigned char level; /* level of the class in hierarchy:
- 0 for leaf classes, and maximal
- level of children + 1 for nodes.
- */
-
- psched_time_t last; /* Last end of service */
- psched_time_t undertime;
- long avgidle;
- long deficit; /* Saved deficit for WRR */
- psched_time_t penalized;
- struct gnet_stats_basic_sync bstats;
- struct gnet_stats_queue qstats;
- struct net_rate_estimator __rcu *rate_est;
- struct tc_cbq_xstats xstats;
-
- struct tcf_proto __rcu *filter_list;
- struct tcf_block *block;
-
- int filters;
-
- struct cbq_class *defaults[TC_PRIO_MAX + 1];
-};
-
-struct cbq_sched_data {
- struct Qdisc_class_hash clhash; /* Hash table of all classes */
- int nclasses[TC_CBQ_MAXPRIO + 1];
- unsigned int quanta[TC_CBQ_MAXPRIO + 1];
-
- struct cbq_class link;
-
- unsigned int activemask;
- struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
- with backlog */
-
-#ifdef CONFIG_NET_CLS_ACT
- struct cbq_class *rx_class;
-#endif
- struct cbq_class *tx_class;
- struct cbq_class *tx_borrowed;
- int tx_len;
- psched_time_t now; /* Cached timestamp */
- unsigned int pmask;
-
- struct qdisc_watchdog watchdog; /* Watchdog timer,
- started when CBQ has
- backlog, but cannot
- transmit just now */
- psched_tdiff_t wd_expires;
- int toplevel;
- u32 hgenerator;
-};
-
-
-#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
-
-static inline struct cbq_class *
-cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
-{
- struct Qdisc_class_common *clc;
-
- clc = qdisc_class_find(&q->clhash, classid);
- if (clc == NULL)
- return NULL;
- return container_of(clc, struct cbq_class, common);
-}
-
-#ifdef CONFIG_NET_CLS_ACT
-
-static struct cbq_class *
-cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
-{
- struct cbq_class *cl;
-
- for (cl = this->tparent; cl; cl = cl->tparent) {
- struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
-
- if (new != NULL && new != this)
- return new;
- }
- return NULL;
-}
-
-#endif
-
-/* Classify packet. The procedure is pretty complicated, but
- * it allows us to combine link sharing and priority scheduling
- * transparently.
- *
- * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
- * so that it resolves to split nodes. Then packets are classified
- * by logical priority, or a more specific classifier may be attached
- * to the split node.
- */
-
-static struct cbq_class *
-cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *head = &q->link;
- struct cbq_class **defmap;
- struct cbq_class *cl = NULL;
- u32 prio = skb->priority;
- struct tcf_proto *fl;
- struct tcf_result res;
-
- /*
- * Step 1. If skb->priority points to one of our classes, use it.
- */
- if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
- (cl = cbq_class_lookup(q, prio)) != NULL)
- return cl;
-
- *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- for (;;) {
- int result = 0;
- defmap = head->defaults;
-
- fl = rcu_dereference_bh(head->filter_list);
- /*
- * Step 2+n. Apply classifier.
- */
- result = tcf_classify(skb, NULL, fl, &res, true);
- if (!fl || result < 0)
- goto fallback;
- if (result == TC_ACT_SHOT)
- return NULL;
-
- cl = (void *)res.class;
- if (!cl) {
- if (TC_H_MAJ(res.classid))
- cl = cbq_class_lookup(q, res.classid);
- else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
- cl = defmap[TC_PRIO_BESTEFFORT];
-
- if (cl == NULL)
- goto fallback;
- }
- if (cl->level >= head->level)
- goto fallback;
-#ifdef CONFIG_NET_CLS_ACT
- switch (result) {
- case TC_ACT_QUEUED:
- case TC_ACT_STOLEN:
- case TC_ACT_TRAP:
- *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- fallthrough;
- case TC_ACT_RECLASSIFY:
- return cbq_reclassify(skb, cl);
- }
-#endif
- if (cl->level == 0)
- return cl;
-
- /*
- * Step 3+n. If classifier selected a link sharing class,
- * apply agency specific classifier.
- * Repeat this procedure until we hit a leaf node.
- */
- head = cl;
- }
-
-fallback:
- cl = head;
-
- /*
- * Step 4. No success...
- */
- if (TC_H_MAJ(prio) == 0 &&
- !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
- !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
- return head;
-
- return cl;
-}
-
-/*
- * A packet has just been enqueued on the empty class.
- * cbq_activate_class adds it to the tail of active class list
- * of its priority band.
- */
-
-static inline void cbq_activate_class(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- int prio = cl->cpriority;
- struct cbq_class *cl_tail;
-
- cl_tail = q->active[prio];
- q->active[prio] = cl;
-
- if (cl_tail != NULL) {
- cl->next_alive = cl_tail->next_alive;
- cl_tail->next_alive = cl;
- } else {
- cl->next_alive = cl;
- q->activemask |= (1<<prio);
- }
-}
-
-/*
- * Unlink class from active chain.
- * Note that this same procedure is done directly in cbq_dequeue*
- * during round-robin procedure.
- */
-
-static void cbq_deactivate_class(struct cbq_class *this)
-{
- struct cbq_sched_data *q = qdisc_priv(this->qdisc);
- int prio = this->cpriority;
- struct cbq_class *cl;
- struct cbq_class *cl_prev = q->active[prio];
-
- do {
- cl = cl_prev->next_alive;
- if (cl == this) {
- cl_prev->next_alive = cl->next_alive;
- cl->next_alive = NULL;
-
- if (cl == q->active[prio]) {
- q->active[prio] = cl_prev;
- if (cl == q->active[prio]) {
- q->active[prio] = NULL;
- q->activemask &= ~(1<<prio);
- return;
- }
- }
- return;
- }
- } while ((cl_prev = cl) != q->active[prio]);
-}
-
-static void
-cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
-{
- int toplevel = q->toplevel;
-
- if (toplevel > cl->level) {
- psched_time_t now = psched_get_time();
-
- do {
- if (cl->undertime < now) {
- q->toplevel = cl->level;
- return;
- }
- } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
- }
-}
-
-static int
-cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff **to_free)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- int ret;
- struct cbq_class *cl = cbq_classify(skb, sch, &ret);
-
-#ifdef CONFIG_NET_CLS_ACT
- q->rx_class = cl;
-#endif
- if (cl == NULL) {
- if (ret & __NET_XMIT_BYPASS)
- qdisc_qstats_drop(sch);
- __qdisc_drop(skb, to_free);
- return ret;
- }
-
- ret = qdisc_enqueue(skb, cl->q, to_free);
- if (ret == NET_XMIT_SUCCESS) {
- sch->q.qlen++;
- cbq_mark_toplevel(q, cl);
- if (!cl->next_alive)
- cbq_activate_class(cl);
- return ret;
- }
-
- if (net_xmit_drop_count(ret)) {
- qdisc_qstats_drop(sch);
- cbq_mark_toplevel(q, cl);
- cl->qstats.drops++;
- }
- return ret;
-}
-
-/* Overlimit action: penalize leaf class by adding offtime */
-static void cbq_overlimit(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- psched_tdiff_t delay = cl->undertime - q->now;
-
- if (!cl->delayed) {
- delay += cl->offtime;
-
- /*
- * Class goes to sleep, so that it will have no
- * chance to work avgidle. Let's forgive it 8)
- *
- * BTW cbq-2.0 has a bug in this
- * place; apparently they forgot to shift it by cl->ewma_log.
- */
- if (cl->avgidle < 0)
- delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
- if (cl->avgidle < cl->minidle)
- cl->avgidle = cl->minidle;
- if (delay <= 0)
- delay = 1;
- cl->undertime = q->now + delay;
-
- cl->xstats.overactions++;
- cl->delayed = 1;
- }
- if (q->wd_expires == 0 || q->wd_expires > delay)
- q->wd_expires = delay;
-
- /* Dirty work! We must schedule wakeups based on
- * real available rate, rather than leaf rate,
- * which may be tiny (even zero).
- */
- if (q->toplevel == TC_CBQ_MAXLEVEL) {
- struct cbq_class *b;
- psched_tdiff_t base_delay = q->wd_expires;
-
- for (b = cl->borrow; b; b = b->borrow) {
- delay = b->undertime - q->now;
- if (delay < base_delay) {
- if (delay <= 0)
- delay = 1;
- base_delay = delay;
- }
- }
-
- q->wd_expires = base_delay;
- }
-}
-
-/*
- * This is a mission-critical procedure.
- *
- * We "regenerate" the toplevel cutoff if the transmitting class
- * has backlog and is not regulated. It is not part of the
- * original CBQ description, but looks more reasonable.
- * Probably, it is wrong; this question needs further investigation.
- */
-
-static inline void
-cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
- struct cbq_class *borrowed)
-{
- if (cl && q->toplevel >= borrowed->level) {
- if (cl->q->q.qlen > 1) {
- do {
- if (borrowed->undertime == PSCHED_PASTPERFECT) {
- q->toplevel = borrowed->level;
- return;
- }
- } while ((borrowed = borrowed->borrow) != NULL);
- }
-#if 0
- /* It is not necessary now. Uncommenting it
- will save CPU cycles, but decrease fairness.
- */
- q->toplevel = TC_CBQ_MAXLEVEL;
-#endif
- }
-}
-
-static void
-cbq_update(struct cbq_sched_data *q)
-{
- struct cbq_class *this = q->tx_class;
- struct cbq_class *cl = this;
- int len = q->tx_len;
- psched_time_t now;
-
- q->tx_class = NULL;
- /* Time integrator. We calculate EOS time
- * by adding expected packet transmission time.
- */
- now = q->now + L2T(&q->link, len);
-
- for ( ; cl; cl = cl->share) {
- long avgidle = cl->avgidle;
- long idle;
-
- _bstats_update(&cl->bstats, len, 1);
-
- /*
- * (now - last) is total time between packet right edges.
- * (last_pktlen/rate) is "virtual" busy time, so that
- *
- * idle = (now - last) - last_pktlen/rate
- */
-
- idle = now - cl->last;
- if ((unsigned long)idle > 128*1024*1024) {
- avgidle = cl->maxidle;
- } else {
- idle -= L2T(cl, len);
-
- /* true_avgidle := (1-W)*true_avgidle + W*idle,
- * where W=2^{-ewma_log}. But cl->avgidle is scaled:
- * cl->avgidle == true_avgidle/W,
- * hence:
- */
- avgidle += idle - (avgidle>>cl->ewma_log);
- }
-
- if (avgidle <= 0) {
- /* Overlimit or at-limit */
-
- if (avgidle < cl->minidle)
- avgidle = cl->minidle;
-
- cl->avgidle = avgidle;
-
- /* Calculate expected time, when this class
- * will be allowed to send.
- * It will occur, when:
- * (1-W)*true_avgidle + W*delay = 0, i.e.
- * idle = (1/W - 1)*(-true_avgidle)
- * or
- * idle = (1 - W)*(-cl->avgidle);
- */
- idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
-
- /*
- * That is not all.
- * To maintain the rate allocated to the class,
- * we add to undertime the virtual clock time
- * necessary to complete the transmitted packet.
- * (len/phys_bandwidth has already passed
- * by the moment of cbq_update)
- */
-
- idle -= L2T(&q->link, len);
- idle += L2T(cl, len);
-
- cl->undertime = now + idle;
- } else {
- /* Underlimit */
-
- cl->undertime = PSCHED_PASTPERFECT;
- if (avgidle > cl->maxidle)
- cl->avgidle = cl->maxidle;
- else
- cl->avgidle = avgidle;
- }
- if ((s64)(now - cl->last) > 0)
- cl->last = now;
- }
-
- cbq_update_toplevel(q, this, q->tx_borrowed);
-}
-
-static inline struct cbq_class *
-cbq_under_limit(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- struct cbq_class *this_cl = cl;
-
- if (cl->tparent == NULL)
- return cl;
-
- if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
- cl->delayed = 0;
- return cl;
- }
-
- do {
- /* This is a very suspicious place. Now the overlimit
- * action is generated for non-bounded classes
- * only if the link is completely congested.
- * Though it agrees with the ancestor-only paradigm,
- * it looks very stupid. Particularly,
- * it means that this chunk of code will either
- * never be called or result in strong amplification
- * of burstiness. Dangerous, silly, and yet
- * no other solution exists.
- */
- cl = cl->borrow;
- if (!cl) {
- this_cl->qstats.overlimits++;
- cbq_overlimit(this_cl);
- return NULL;
- }
- if (cl->level > q->toplevel)
- return NULL;
- } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
-
- cl->delayed = 0;
- return cl;
-}
-
-static inline struct sk_buff *
-cbq_dequeue_prio(struct Qdisc *sch, int prio)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl_tail, *cl_prev, *cl;
- struct sk_buff *skb;
- int deficit;
-
- cl_tail = cl_prev = q->active[prio];
- cl = cl_prev->next_alive;
-
- do {
- deficit = 0;
-
- /* Start round */
- do {
- struct cbq_class *borrow = cl;
-
- if (cl->q->q.qlen &&
- (borrow = cbq_under_limit(cl)) == NULL)
- goto skip_class;
-
- if (cl->deficit <= 0) {
- /* Class exhausted its allotment per
- * this round. Switch to the next one.
- */
- deficit = 1;
- cl->deficit += cl->quantum;
- goto next_class;
- }
-
- skb = cl->q->dequeue(cl->q);
-
- /* Class did not give us any skb :-(
- * It could occur even if cl->q->q.qlen != 0
- * f.e. if cl->q == "tbf"
- */
- if (skb == NULL)
- goto skip_class;
-
- cl->deficit -= qdisc_pkt_len(skb);
- q->tx_class = cl;
- q->tx_borrowed = borrow;
- if (borrow != cl) {
-#ifndef CBQ_XSTATS_BORROWS_BYTES
- borrow->xstats.borrows++;
- cl->xstats.borrows++;
-#else
- borrow->xstats.borrows += qdisc_pkt_len(skb);
- cl->xstats.borrows += qdisc_pkt_len(skb);
-#endif
- }
- q->tx_len = qdisc_pkt_len(skb);
-
- if (cl->deficit <= 0) {
- q->active[prio] = cl;
- cl = cl->next_alive;
- cl->deficit += cl->quantum;
- }
- return skb;
-
-skip_class:
- if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
- /* Class is empty or penalized.
- * Unlink it from active chain.
- */
- cl_prev->next_alive = cl->next_alive;
- cl->next_alive = NULL;
-
- /* Did cl_tail point to it? */
- if (cl == cl_tail) {
- /* Repair it! */
- cl_tail = cl_prev;
-
- /* Was it the last class in this band? */
- if (cl == cl_tail) {
- /* Kill the band! */
- q->active[prio] = NULL;
- q->activemask &= ~(1<<prio);
- if (cl->q->q.qlen)
- cbq_activate_class(cl);
- return NULL;
- }
-
- q->active[prio] = cl_tail;
- }
- if (cl->q->q.qlen)
- cbq_activate_class(cl);
-
- cl = cl_prev;
- }
-
-next_class:
- cl_prev = cl;
- cl = cl->next_alive;
- } while (cl_prev != cl_tail);
- } while (deficit);
-
- q->active[prio] = cl_prev;
-
- return NULL;
-}
-
-static inline struct sk_buff *
-cbq_dequeue_1(struct Qdisc *sch)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct sk_buff *skb;
- unsigned int activemask;
-
- activemask = q->activemask & 0xFF;
- while (activemask) {
- int prio = ffz(~activemask);
- activemask &= ~(1<<prio);
- skb = cbq_dequeue_prio(sch, prio);
- if (skb)
- return skb;
- }
- return NULL;
-}
-
-static struct sk_buff *
-cbq_dequeue(struct Qdisc *sch)
-{
- struct sk_buff *skb;
- struct cbq_sched_data *q = qdisc_priv(sch);
- psched_time_t now;
-
- now = psched_get_time();
-
- if (q->tx_class)
- cbq_update(q);
-
- q->now = now;
-
- for (;;) {
- q->wd_expires = 0;
-
- skb = cbq_dequeue_1(sch);
- if (skb) {
- qdisc_bstats_update(sch, skb);
- sch->q.qlen--;
- return skb;
- }
-
- /* All the classes are overlimit.
- *
- * It is possible, if:
- *
- * 1. Scheduler is empty.
- * 2. Toplevel cutoff inhibited borrowing.
- * 3. Root class is overlimit.
- *
- * Reset the 2nd and 3rd conditions and retry.
- *
- * Note that NS and cbq-2.0 are buggy: peeking
- * an arbitrary class is appropriate for ancestor-only
- * sharing, but not for the top-level algorithm.
- *
- * Our version is better, but slower, because it requires
- * two passes, but it is unavoidable with top-level sharing.
- */
-
- if (q->toplevel == TC_CBQ_MAXLEVEL &&
- q->link.undertime == PSCHED_PASTPERFECT)
- break;
-
- q->toplevel = TC_CBQ_MAXLEVEL;
- q->link.undertime = PSCHED_PASTPERFECT;
- }
-
- /* No packets in scheduler or nobody wants to give them to us :-(
- * Sigh... start watchdog timer in the last case.
- */
-
- if (sch->q.qlen) {
- qdisc_qstats_overlimit(sch);
- if (q->wd_expires)
- qdisc_watchdog_schedule(&q->watchdog,
- now + q->wd_expires);
- }
- return NULL;
-}
-
-/* CBQ class maintenance routines */
-
-static void cbq_adjust_levels(struct cbq_class *this)
-{
- if (this == NULL)
- return;
-
- do {
- int level = 0;
- struct cbq_class *cl;
-
- cl = this->children;
- if (cl) {
- do {
- if (cl->level > level)
- level = cl->level;
- } while ((cl = cl->sibling) != this->children);
- }
- this->level = level + 1;
- } while ((this = this->tparent) != NULL);
-}
-
-static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
-{
- struct cbq_class *cl;
- unsigned int h;
-
- if (q->quanta[prio] == 0)
- return;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- /* BUGGGG... Beware! This expression suffers from
- * arithmetic overflows!
- */
- if (cl->priority == prio) {
- cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
- q->quanta[prio];
- }
- if (cl->quantum <= 0 ||
- cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
- pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
- cl->common.classid, cl->quantum);
- cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
- }
- }
- }
-}
-
-static void cbq_sync_defmap(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- struct cbq_class *split = cl->split;
- unsigned int h;
- int i;
-
- if (split == NULL)
- return;
-
- for (i = 0; i <= TC_PRIO_MAX; i++) {
- if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
- split->defaults[i] = NULL;
- }
-
- for (i = 0; i <= TC_PRIO_MAX; i++) {
- int level = split->level;
-
- if (split->defaults[i])
- continue;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- struct cbq_class *c;
-
- hlist_for_each_entry(c, &q->clhash.hash[h],
- common.hnode) {
- if (c->split == split && c->level < level &&
- c->defmap & (1<<i)) {
- split->defaults[i] = c;
- level = c->level;
- }
- }
- }
- }
-}
-
-static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
-{
- struct cbq_class *split = NULL;
-
- if (splitid == 0) {
- split = cl->split;
- if (!split)
- return;
- splitid = split->common.classid;
- }
-
- if (split == NULL || split->common.classid != splitid) {
- for (split = cl->tparent; split; split = split->tparent)
- if (split->common.classid == splitid)
- break;
- }
-
- if (split == NULL)
- return;
-
- if (cl->split != split) {
- cl->defmap = 0;
- cbq_sync_defmap(cl);
- cl->split = split;
- cl->defmap = def & mask;
- } else
- cl->defmap = (cl->defmap & ~mask) | (def & mask);
-
- cbq_sync_defmap(cl);
-}
-
-static void cbq_unlink_class(struct cbq_class *this)
-{
- struct cbq_class *cl, **clp;
- struct cbq_sched_data *q = qdisc_priv(this->qdisc);
-
- qdisc_class_hash_remove(&q->clhash, &this->common);
-
- if (this->tparent) {
- clp = &this->sibling;
- cl = *clp;
- do {
- if (cl == this) {
- *clp = cl->sibling;
- break;
- }
- clp = &cl->sibling;
- } while ((cl = *clp) != this->sibling);
-
- if (this->tparent->children == this) {
- this->tparent->children = this->sibling;
- if (this->sibling == this)
- this->tparent->children = NULL;
- }
- } else {
- WARN_ON(this->sibling != this);
- }
-}
-
-static void cbq_link_class(struct cbq_class *this)
-{
- struct cbq_sched_data *q = qdisc_priv(this->qdisc);
- struct cbq_class *parent = this->tparent;
-
- this->sibling = this;
- qdisc_class_hash_insert(&q->clhash, &this->common);
-
- if (parent == NULL)
- return;
-
- if (parent->children == NULL) {
- parent->children = this;
- } else {
- this->sibling = parent->children->sibling;
- parent->children->sibling = this;
- }
-}
-
-static void
-cbq_reset(struct Qdisc *sch)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl;
- int prio;
- unsigned int h;
-
- q->activemask = 0;
- q->pmask = 0;
- q->tx_class = NULL;
- q->tx_borrowed = NULL;
- qdisc_watchdog_cancel(&q->watchdog);
- q->toplevel = TC_CBQ_MAXLEVEL;
- q->now = psched_get_time();
-
- for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
- q->active[prio] = NULL;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- qdisc_reset(cl->q);
-
- cl->next_alive = NULL;
- cl->undertime = PSCHED_PASTPERFECT;
- cl->avgidle = cl->maxidle;
- cl->deficit = cl->quantum;
- cl->cpriority = cl->priority;
- }
- }
-}
-
-
-static void cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
-{
- if (lss->change & TCF_CBQ_LSS_FLAGS) {
- cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
- cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
- }
- if (lss->change & TCF_CBQ_LSS_EWMA)
- cl->ewma_log = lss->ewma_log;
- if (lss->change & TCF_CBQ_LSS_AVPKT)
- cl->avpkt = lss->avpkt;
- if (lss->change & TCF_CBQ_LSS_MINIDLE)
- cl->minidle = -(long)lss->minidle;
- if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
- cl->maxidle = lss->maxidle;
- cl->avgidle = lss->maxidle;
- }
- if (lss->change & TCF_CBQ_LSS_OFFTIME)
- cl->offtime = lss->offtime;
-}
-
-static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
-{
- q->nclasses[cl->priority]--;
- q->quanta[cl->priority] -= cl->weight;
- cbq_normalize_quanta(q, cl->priority);
-}
-
-static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
-{
- q->nclasses[cl->priority]++;
- q->quanta[cl->priority] += cl->weight;
- cbq_normalize_quanta(q, cl->priority);
-}
-
-static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-
- if (wrr->allot)
- cl->allot = wrr->allot;
- if (wrr->weight)
- cl->weight = wrr->weight;
- if (wrr->priority) {
- cl->priority = wrr->priority - 1;
- cl->cpriority = cl->priority;
- if (cl->priority >= cl->priority2)
- cl->priority2 = TC_CBQ_MAXPRIO - 1;
- }
-
- cbq_addprio(q, cl);
- return 0;
-}
-
-static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
-{
- cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
- return 0;
-}
-
-static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
- [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
- [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
- [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
- [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
- [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
- [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
- [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
-};
-
-static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
- struct nlattr *opt,
- struct netlink_ext_ack *extack)
-{
- int err;
-
- if (!opt) {
- NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
- return -EINVAL;
- }
-
- err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
- cbq_policy, extack);
- if (err < 0)
- return err;
-
- if (tb[TCA_CBQ_WRROPT]) {
- const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
-
- if (wrr->priority > TC_CBQ_MAXPRIO) {
- NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
- err = -EINVAL;
- }
- }
- return err;
-}
-
-static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_CBQ_MAX + 1];
- struct tc_ratespec *r;
- int err;
-
- qdisc_watchdog_init(&q->watchdog, sch);
-
- err = cbq_opt_parse(tb, opt, extack);
- if (err < 0)
- return err;
-
- if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
- NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
- return -EINVAL;
- }
-
- r = nla_data(tb[TCA_CBQ_RATE]);
-
- q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
- if (!q->link.R_tab)
- return -EINVAL;
-
- err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
- if (err)
- goto put_rtab;
-
- err = qdisc_class_hash_init(&q->clhash);
- if (err < 0)
- goto put_block;
-
- q->link.sibling = &q->link;
- q->link.common.classid = sch->handle;
- q->link.qdisc = sch;
- q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- sch->handle, NULL);
- if (!q->link.q)
- q->link.q = &noop_qdisc;
- else
- qdisc_hash_add(q->link.q, true);
-
- q->link.priority = TC_CBQ_MAXPRIO - 1;
- q->link.priority2 = TC_CBQ_MAXPRIO - 1;
- q->link.cpriority = TC_CBQ_MAXPRIO - 1;
- q->link.allot = psched_mtu(qdisc_dev(sch));
- q->link.quantum = q->link.allot;
- q->link.weight = q->link.R_tab->rate.rate;
-
- q->link.ewma_log = TC_CBQ_DEF_EWMA;
- q->link.avpkt = q->link.allot/2;
- q->link.minidle = -0x7FFFFFFF;
-
- q->toplevel = TC_CBQ_MAXLEVEL;
- q->now = psched_get_time();
-
- cbq_link_class(&q->link);
-
- if (tb[TCA_CBQ_LSSOPT])
- cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
-
- cbq_addprio(q, &q->link);
- return 0;
-
-put_block:
- tcf_block_put(q->link.block);
-
-put_rtab:
- qdisc_put_rtab(q->link.R_tab);
- return err;
-}
-
-static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
-
- if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_lssopt opt;
-
- opt.flags = 0;
- if (cl->borrow == NULL)
- opt.flags |= TCF_CBQ_LSS_BOUNDED;
- if (cl->share == NULL)
- opt.flags |= TCF_CBQ_LSS_ISOLATED;
- opt.ewma_log = cl->ewma_log;
- opt.level = cl->level;
- opt.avpkt = cl->avpkt;
- opt.maxidle = cl->maxidle;
- opt.minidle = (u32)(-cl->minidle);
- opt.offtime = cl->offtime;
- opt.change = ~0;
- if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_wrropt opt;
-
- memset(&opt, 0, sizeof(opt));
- opt.flags = 0;
- opt.allot = cl->allot;
- opt.priority = cl->priority + 1;
- opt.cpriority = cl->cpriority + 1;
- opt.weight = cl->weight;
- if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_fopt opt;
-
- if (cl->split || cl->defmap) {
- opt.split = cl->split ? cl->split->common.classid : 0;
- opt.defmap = cl->defmap;
- opt.defchange = ~0;
- if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
- goto nla_put_failure;
- }
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
-{
- if (cbq_dump_lss(skb, cl) < 0 ||
- cbq_dump_rate(skb, cl) < 0 ||
- cbq_dump_wrr(skb, cl) < 0 ||
- cbq_dump_fopt(skb, cl) < 0)
- return -1;
- return 0;
-}
-
-static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct nlattr *nest;
-
- nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
- if (cbq_dump_attr(skb, &q->link) < 0)
- goto nla_put_failure;
- return nla_nest_end(skb, nest);
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static int
-cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
-
- q->link.xstats.avgidle = q->link.avgidle;
- return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
-}
-
-static int
-cbq_dump_class(struct Qdisc *sch, unsigned long arg,
- struct sk_buff *skb, struct tcmsg *tcm)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
- struct nlattr *nest;
-
- if (cl->tparent)
- tcm->tcm_parent = cl->tparent->common.classid;
- else
- tcm->tcm_parent = TC_H_ROOT;
- tcm->tcm_handle = cl->common.classid;
- tcm->tcm_info = cl->q->handle;
-
- nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
- if (cbq_dump_attr(skb, cl) < 0)
- goto nla_put_failure;
- return nla_nest_end(skb, nest);
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static int
-cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
- struct gnet_dump *d)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)arg;
- __u32 qlen;
-
- cl->xstats.avgidle = cl->avgidle;
- cl->xstats.undertime = 0;
- qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
-
- if (cl->undertime != PSCHED_PASTPERFECT)
- cl->xstats.undertime = cl->undertime - q->now;
-
- if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
- return -1;
-
- return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
-}
-
-static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
- struct Qdisc **old, struct netlink_ext_ack *extack)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- if (new == NULL) {
- new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- cl->common.classid, extack);
- if (new == NULL)
- return -ENOBUFS;
- }
-
- *old = qdisc_replace(sch, new, &cl->q);
- return 0;
-}
-
-static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- return cl->q;
-}
-
-static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- cbq_deactivate_class(cl);
-}
-
-static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
-
- return (unsigned long)cbq_class_lookup(q, classid);
-}
-
-static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
-
- WARN_ON(cl->filters);
-
- tcf_block_put(cl->block);
- qdisc_put(cl->q);
- qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->rate_est);
- if (cl != &q->link)
- kfree(cl);
-}
-
-static void cbq_destroy(struct Qdisc *sch)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct hlist_node *next;
- struct cbq_class *cl;
- unsigned int h;
-
-#ifdef CONFIG_NET_CLS_ACT
- q->rx_class = NULL;
-#endif
- /*
- * Filters must be destroyed first because we don't destroy the
- * classes from root to leaves, which means that filters can still
- * be bound to classes which have been destroyed already. --TGR '04
- */
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- tcf_block_put(cl->block);
- cl->block = NULL;
- }
- }
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
- common.hnode)
- cbq_destroy_class(sch, cl);
- }
- qdisc_class_hash_destroy(&q->clhash);
-}
-
-static int
-cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
- unsigned long *arg, struct netlink_ext_ack *extack)
-{
- int err;
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)*arg;
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_CBQ_MAX + 1];
- struct cbq_class *parent;
- struct qdisc_rate_table *rtab = NULL;
-
- err = cbq_opt_parse(tb, opt, extack);
- if (err < 0)
- return err;
-
- if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
- NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
- return -EOPNOTSUPP;
- }
-
- if (cl) {
- /* Check parent */
- if (parentid) {
- if (cl->tparent &&
- cl->tparent->common.classid != parentid) {
- NL_SET_ERR_MSG(extack, "Invalid parent id");
- return -EINVAL;
- }
- if (!cl->tparent && parentid != TC_H_ROOT) {
- NL_SET_ERR_MSG(extack, "Parent must be root");
- return -EINVAL;
- }
- }
-
- if (tb[TCA_CBQ_RATE]) {
- rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
- tb[TCA_CBQ_RTAB], extack);
- if (rtab == NULL)
- return -EINVAL;
- }
-
- if (tca[TCA_RATE]) {
- err = gen_replace_estimator(&cl->bstats, NULL,
- &cl->rate_est,
- NULL,
- true,
- tca[TCA_RATE]);
- if (err) {
- NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
- qdisc_put_rtab(rtab);
- return err;
- }
- }
-
- /* Change class parameters */
- sch_tree_lock(sch);
-
- if (cl->next_alive != NULL)
- cbq_deactivate_class(cl);
-
- if (rtab) {
- qdisc_put_rtab(cl->R_tab);
- cl->R_tab = rtab;
- }
-
- if (tb[TCA_CBQ_LSSOPT])
- cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
-
- if (tb[TCA_CBQ_WRROPT]) {
- cbq_rmprio(q, cl);
- cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
- }
-
- if (tb[TCA_CBQ_FOPT])
- cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
-
- if (cl->q->q.qlen)
- cbq_activate_class(cl);
-
- sch_tree_unlock(sch);
-
- return 0;
- }
-
- if (parentid == TC_H_ROOT)
- return -EINVAL;
-
- if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
- NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
- return -EINVAL;
- }
-
- rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
- extack);
- if (rtab == NULL)
- return -EINVAL;
-
- if (classid) {
- err = -EINVAL;
- if (TC_H_MAJ(classid ^ sch->handle) ||
- cbq_class_lookup(q, classid)) {
- NL_SET_ERR_MSG(extack, "Specified class not found");
- goto failure;
- }
- } else {
- int i;
- classid = TC_H_MAKE(sch->handle, 0x8000);
-
- for (i = 0; i < 0x8000; i++) {
- if (++q->hgenerator >= 0x8000)
- q->hgenerator = 1;
- if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
- break;
- }
- err = -ENOSR;
- if (i >= 0x8000) {
- NL_SET_ERR_MSG(extack, "Unable to generate classid");
- goto failure;
- }
- classid = classid|q->hgenerator;
- }
-
- parent = &q->link;
- if (parentid) {
- parent = cbq_class_lookup(q, parentid);
- err = -EINVAL;
- if (!parent) {
- NL_SET_ERR_MSG(extack, "Failed to find parentid");
- goto failure;
- }
- }
-
- err = -ENOBUFS;
- cl = kzalloc(sizeof(*cl), GFP_KERNEL);
- if (cl == NULL)
- goto failure;
-
- gnet_stats_basic_sync_init(&cl->bstats);
- err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
- if (err) {
- kfree(cl);
- goto failure;
- }
-
- if (tca[TCA_RATE]) {
- err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
- NULL, true, tca[TCA_RATE]);
- if (err) {
- NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
- tcf_block_put(cl->block);
- kfree(cl);
- goto failure;
- }
- }
-
- cl->R_tab = rtab;
- rtab = NULL;
- cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
- NULL);
- if (!cl->q)
- cl->q = &noop_qdisc;
- else
- qdisc_hash_add(cl->q, true);
-
- cl->common.classid = classid;
- cl->tparent = parent;
- cl->qdisc = sch;
- cl->allot = parent->allot;
- cl->quantum = cl->allot;
- cl->weight = cl->R_tab->rate.rate;
-
- sch_tree_lock(sch);
- cbq_link_class(cl);
- cl->borrow = cl->tparent;
- if (cl->tparent != &q->link)
- cl->share = cl->tparent;
- cbq_adjust_levels(parent);
- cl->minidle = -0x7FFFFFFF;
- cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
- cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
- if (cl->ewma_log == 0)
- cl->ewma_log = q->link.ewma_log;
- if (cl->maxidle == 0)
- cl->maxidle = q->link.maxidle;
- if (cl->avpkt == 0)
- cl->avpkt = q->link.avpkt;
- if (tb[TCA_CBQ_FOPT])
- cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
- sch_tree_unlock(sch);
-
- qdisc_class_hash_grow(sch, &q->clhash);
-
- *arg = (unsigned long)cl;
- return 0;
-
-failure:
- qdisc_put_rtab(rtab);
- return err;
-}
-
-static int cbq_delete(struct Qdisc *sch, unsigned long arg,
- struct netlink_ext_ack *extack)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- if (cl->filters || cl->children || cl == &q->link)
- return -EBUSY;
-
- sch_tree_lock(sch);
-
- qdisc_purge_queue(cl->q);
-
- if (cl->next_alive)
- cbq_deactivate_class(cl);
-
- if (q->tx_borrowed == cl)
- q->tx_borrowed = q->tx_class;
- if (q->tx_class == cl) {
- q->tx_class = NULL;
- q->tx_borrowed = NULL;
- }
-#ifdef CONFIG_NET_CLS_ACT
- if (q->rx_class == cl)
- q->rx_class = NULL;
-#endif
-
- cbq_unlink_class(cl);
- cbq_adjust_levels(cl->tparent);
- cl->defmap = 0;
- cbq_sync_defmap(cl);
-
- cbq_rmprio(q, cl);
- sch_tree_unlock(sch);
-
- cbq_destroy_class(sch, cl);
- return 0;
-}
-
-static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
- struct netlink_ext_ack *extack)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- if (cl == NULL)
- cl = &q->link;
-
- return cl->block;
-}
-
-static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
- u32 classid)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *p = (struct cbq_class *)parent;
- struct cbq_class *cl = cbq_class_lookup(q, classid);
-
- if (cl) {
- if (p && p->level <= cl->level)
- return 0;
- cl->filters++;
- return (unsigned long)cl;
- }
- return 0;
-}
-
-static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- cl->filters--;
-}
-
-static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl;
- unsigned int h;
-
- if (arg->stop)
- return;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
- return;
- }
- }
-}
-
-static const struct Qdisc_class_ops cbq_class_ops = {
- .graft = cbq_graft,
- .leaf = cbq_leaf,
- .qlen_notify = cbq_qlen_notify,
- .find = cbq_find,
- .change = cbq_change_class,
- .delete = cbq_delete,
- .walk = cbq_walk,
- .tcf_block = cbq_tcf_block,
- .bind_tcf = cbq_bind_filter,
- .unbind_tcf = cbq_unbind_filter,
- .dump = cbq_dump_class,
- .dump_stats = cbq_dump_class_stats,
-};
-
-static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
- .next = NULL,
- .cl_ops = &cbq_class_ops,
- .id = "cbq",
- .priv_size = sizeof(struct cbq_sched_data),
- .enqueue = cbq_enqueue,
- .dequeue = cbq_dequeue,
- .peek = qdisc_peek_dequeued,
- .init = cbq_init,
- .reset = cbq_reset,
- .destroy = cbq_destroy,
- .change = NULL,
- .dump = cbq_dump,
- .dump_stats = cbq_dump_stats,
- .owner = THIS_MODULE,
-};
-
-static int __init cbq_module_init(void)
-{
- return register_qdisc(&cbq_qdisc_ops);
-}
-static void __exit cbq_module_exit(void)
-{
- unregister_qdisc(&cbq_qdisc_ops);
-}
-module_init(cbq_module_init)
-module_exit(cbq_module_exit)
-MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
deleted file mode 100644
index 401ffaf87d62..000000000000
--- a/net/sched/sch_dsmark.c
+++ /dev/null
@@ -1,518 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* net/sched/sch_dsmark.c - Differentiated Services field marker */
-
-/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
-
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
-#include <linux/bitops.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <asm/byteorder.h>
-
-/*
- * classid       class     marking
- * -------       -----     -------
- *   n/a           0       n/a
- *   x:0           1       use entry [0]
- *   ...          ...      ...
- *   x:y y>0      y+1      use entry [y]
- *   ...          ...      ...
- * x:indices-1  indices    use entry [indices-1]
- *   ...          ...      ...
- *   x:y          y+1      use entry [y & (indices-1)]
- *   ...          ...      ...
- * 0xffff       0x10000    use entry [indices-1]
- */
-
-
-#define NO_DEFAULT_INDEX (1 << 16)
-
-struct mask_value {
- u8 mask;
- u8 value;
-};
-
-struct dsmark_qdisc_data {
- struct Qdisc *q;
- struct tcf_proto __rcu *filter_list;
- struct tcf_block *block;
- struct mask_value *mv;
- u16 indices;
- u8 set_tc_index;
- u32 default_index; /* index range is 0...0xffff */
-#define DSMARK_EMBEDDED_SZ 16
- struct mask_value embedded[DSMARK_EMBEDDED_SZ];
-};
-
-static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
-{
- return index <= p->indices && index > 0;
-}
-
-/* ------------------------- Class/flow operations ------------------------- */
-
-static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
- struct Qdisc *new, struct Qdisc **old,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
- __func__, sch, p, new, old);
-
- if (new == NULL) {
- new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- sch->handle, NULL);
- if (new == NULL)
- new = &noop_qdisc;
- }
-
- *old = qdisc_replace(sch, new, &p->q);
- return 0;
-}
-
-static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- return p->q;
-}
-
-static unsigned long dsmark_find(struct Qdisc *sch, u32 classid)
-{
- return TC_H_MIN(classid) + 1;
-}
-
-static unsigned long dsmark_bind_filter(struct Qdisc *sch,
- unsigned long parent, u32 classid)
-{
- pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
- __func__, sch, qdisc_priv(sch), classid);
-
- return dsmark_find(sch, classid);
-}
-
-static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl)
-{
-}
-
-static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
- [TCA_DSMARK_INDICES] = { .type = NLA_U16 },
- [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 },
- [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG },
- [TCA_DSMARK_MASK] = { .type = NLA_U8 },
- [TCA_DSMARK_VALUE] = { .type = NLA_U8 },
-};
-
-static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
- struct nlattr **tca, unsigned long *arg,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_DSMARK_MAX + 1];
- int err = -EINVAL;
-
- pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
- __func__, sch, p, classid, parent, *arg);
-
- if (!dsmark_valid_index(p, *arg)) {
- err = -ENOENT;
- goto errout;
- }
-
- if (!opt)
- goto errout;
-
- err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt,
- dsmark_policy, NULL);
- if (err < 0)
- goto errout;
-
- if (tb[TCA_DSMARK_VALUE])
- p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]);
-
- if (tb[TCA_DSMARK_MASK])
- p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
-
- err = 0;
-
-errout:
- return err;
-}
-
-static int dsmark_delete(struct Qdisc *sch, unsigned long arg,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- if (!dsmark_valid_index(p, arg))
- return -EINVAL;
-
- p->mv[arg - 1].mask = 0xff;
- p->mv[arg - 1].value = 0;
-
- return 0;
-}
-
-static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- int i;
-
- pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
- __func__, sch, p, walker);
-
- if (walker->stop)
- return;
-
- for (i = 0; i < p->indices; i++) {
- if (p->mv[i].mask == 0xff && !p->mv[i].value) {
- walker->count++;
- continue;
- }
- if (!tc_qdisc_stats_dump(sch, i + 1, walker))
- break;
- }
-}
-
-static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- return p->block;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
-
-static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff **to_free)
-{
- unsigned int len = qdisc_pkt_len(skb);
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- int err;
-
- pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
-
- if (p->set_tc_index) {
- int wlen = skb_network_offset(skb);
-
- switch (skb_protocol(skb, true)) {
- case htons(ETH_P_IP):
- wlen += sizeof(struct iphdr);
- if (!pskb_may_pull(skb, wlen) ||
- skb_try_make_writable(skb, wlen))
- goto drop;
-
- skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
- & ~INET_ECN_MASK;
- break;
-
- case htons(ETH_P_IPV6):
- wlen += sizeof(struct ipv6hdr);
- if (!pskb_may_pull(skb, wlen) ||
- skb_try_make_writable(skb, wlen))
- goto drop;
-
- skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
- & ~INET_ECN_MASK;
- break;
- default:
- skb->tc_index = 0;
- break;
- }
- }
-
- if (TC_H_MAJ(skb->priority) == sch->handle)
- skb->tc_index = TC_H_MIN(skb->priority);
- else {
- struct tcf_result res;
- struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
- int result = tcf_classify(skb, NULL, fl, &res, false);
-
- pr_debug("result %d class 0x%04x\n", result, res.classid);
-
- switch (result) {
-#ifdef CONFIG_NET_CLS_ACT
- case TC_ACT_QUEUED:
- case TC_ACT_STOLEN:
- case TC_ACT_TRAP:
- __qdisc_drop(skb, to_free);
- return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-
- case TC_ACT_SHOT:
- goto drop;
-#endif
- case TC_ACT_OK:
- skb->tc_index = TC_H_MIN(res.classid);
- break;
-
- default:
- if (p->default_index != NO_DEFAULT_INDEX)
- skb->tc_index = p->default_index;
- break;
- }
- }
-
- err = qdisc_enqueue(skb, p->q, to_free);
- if (err != NET_XMIT_SUCCESS) {
- if (net_xmit_drop_count(err))
- qdisc_qstats_drop(sch);
- return err;
- }
-
- sch->qstats.backlog += len;
- sch->q.qlen++;
-
- return NET_XMIT_SUCCESS;
-
-drop:
- qdisc_drop(skb, sch, to_free);
- return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-}
-
-static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct sk_buff *skb;
- u32 index;
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
- skb = qdisc_dequeue_peeked(p->q);
- if (skb == NULL)
- return NULL;
-
- qdisc_bstats_update(sch, skb);
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
-
- index = skb->tc_index & (p->indices - 1);
- pr_debug("index %d->%d\n", skb->tc_index, index);
-
- switch (skb_protocol(skb, true)) {
- case htons(ETH_P_IP):
- ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask,
- p->mv[index].value);
- break;
- case htons(ETH_P_IPV6):
- ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask,
- p->mv[index].value);
- break;
- default:
- /*
- * Only complain if a change was actually attempted.
- * This way, we can send non-IP traffic through dsmark
- * and don't need yet another qdisc as a bypass.
- */
- if (p->mv[index].mask != 0xff || p->mv[index].value)
- pr_warn("%s: unsupported protocol %d\n",
- __func__, ntohs(skb_protocol(skb, true)));
- break;
- }
-
- return skb;
-}
-
-static struct sk_buff *dsmark_peek(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
- return p->q->ops->peek(p->q);
-}
-
-static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *tb[TCA_DSMARK_MAX + 1];
- int err = -EINVAL;
- u32 default_index = NO_DEFAULT_INDEX;
- u16 indices;
- int i;
-
- pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
-
- if (!opt)
- goto errout;
-
- err = tcf_block_get(&p->block, &p->filter_list, sch, extack);
- if (err)
- return err;
-
- err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt,
- dsmark_policy, NULL);
- if (err < 0)
- goto errout;
-
- err = -EINVAL;
- if (!tb[TCA_DSMARK_INDICES])
- goto errout;
- indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
-
- if (hweight32(indices) != 1)
- goto errout;
-
- if (tb[TCA_DSMARK_DEFAULT_INDEX])
- default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
-
- if (indices <= DSMARK_EMBEDDED_SZ)
- p->mv = p->embedded;
- else
- p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL);
- if (!p->mv) {
- err = -ENOMEM;
- goto errout;
- }
- for (i = 0; i < indices; i++) {
- p->mv[i].mask = 0xff;
- p->mv[i].value = 0;
- }
- p->indices = indices;
- p->default_index = default_index;
- p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
-
- p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle,
- NULL);
- if (p->q == NULL)
- p->q = &noop_qdisc;
- else
- qdisc_hash_add(p->q, true);
-
- pr_debug("%s: qdisc %p\n", __func__, p->q);
-
- err = 0;
-errout:
- return err;
-}
-
-static void dsmark_reset(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
- if (p->q)
- qdisc_reset(p->q);
-}
-
-static void dsmark_destroy(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
- tcf_block_put(p->block);
- qdisc_put(p->q);
- if (p->mv != p->embedded)
- kfree(p->mv);
-}
-
-static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
- struct sk_buff *skb, struct tcmsg *tcm)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *opts = NULL;
-
- pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
-
- if (!dsmark_valid_index(p, cl))
- return -EINVAL;
-
- tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
- tcm->tcm_info = p->q->handle;
-
- opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
- if (opts == NULL)
- goto nla_put_failure;
- if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) ||
- nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value))
- goto nla_put_failure;
-
- return nla_nest_end(skb, opts);
-
-nla_put_failure:
- nla_nest_cancel(skb, opts);
- return -EMSGSIZE;
-}
-
-static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *opts = NULL;
-
- opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
- if (opts == NULL)
- goto nla_put_failure;
- if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
- goto nla_put_failure;
-
- if (p->default_index != NO_DEFAULT_INDEX &&
- nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
- goto nla_put_failure;
-
- if (p->set_tc_index &&
- nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
- goto nla_put_failure;
-
- return nla_nest_end(skb, opts);
-
-nla_put_failure:
- nla_nest_cancel(skb, opts);
- return -EMSGSIZE;
-}
-
-static const struct Qdisc_class_ops dsmark_class_ops = {
- .graft = dsmark_graft,
- .leaf = dsmark_leaf,
- .find = dsmark_find,
- .change = dsmark_change,
- .delete = dsmark_delete,
- .walk = dsmark_walk,
- .tcf_block = dsmark_tcf_block,
- .bind_tcf = dsmark_bind_filter,
- .unbind_tcf = dsmark_unbind_filter,
- .dump = dsmark_dump_class,
-};
-
-static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
- .next = NULL,
- .cl_ops = &dsmark_class_ops,
- .id = "dsmark",
- .priv_size = sizeof(struct dsmark_qdisc_data),
- .enqueue = dsmark_enqueue,
- .dequeue = dsmark_dequeue,
- .peek = dsmark_peek,
- .init = dsmark_init,
- .reset = dsmark_reset,
- .destroy = dsmark_destroy,
- .change = NULL,
- .dump = dsmark_dump,
- .owner = THIS_MODULE,
-};
-
-static int __init dsmark_module_init(void)
-{
- return register_qdisc(&dsmark_qdisc_ops);
-}
-
-static void __exit dsmark_module_exit(void)
-{
- unregister_qdisc(&dsmark_qdisc_ops);
-}
-
-module_init(dsmark_module_init)
-module_exit(dsmark_module_exit)
-
-MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 4c68abaa289b..48ed87b91086 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -17,6 +17,8 @@
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
+#include "sch_mqprio_lib.h"
+
struct mqprio_sched {
struct Qdisc **qdiscs;
u16 mode;
@@ -27,6 +29,62 @@ struct mqprio_sched {
u64 max_rate[TC_QOPT_MAX_QUEUE];
};
+static int mqprio_enable_offload(struct Qdisc *sch,
+ const struct tc_mqprio_qopt *qopt,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int err, i;
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
+ return -EINVAL;
+ break;
+ case TC_MQPRIO_MODE_CHANNEL:
+ mqprio.flags = priv->flags;
+ if (priv->flags & TC_MQPRIO_F_MODE)
+ mqprio.mode = priv->mode;
+ if (priv->flags & TC_MQPRIO_F_SHAPER)
+ mqprio.shaper = priv->shaper;
+ if (priv->flags & TC_MQPRIO_F_MIN_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.min_rate[i] = priv->min_rate[i];
+ if (priv->flags & TC_MQPRIO_F_MAX_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.max_rate[i] = priv->max_rate[i];
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ if (err)
+ return err;
+
+ priv->hw_offload = mqprio.qopt.hw;
+
+ return 0;
+}
+
+static void mqprio_disable_offload(struct Qdisc *sch)
+{
+ struct tc_mqprio_qopt_offload mqprio = { { 0 } };
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ case TC_MQPRIO_MODE_CHANNEL:
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ break;
+ }
+}
+
static void mqprio_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
@@ -41,37 +99,17 @@ static void mqprio_destroy(struct Qdisc *sch)
kfree(priv->qdiscs);
}
- if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
- struct tc_mqprio_qopt_offload mqprio = { { 0 } };
-
- switch (priv->mode) {
- case TC_MQPRIO_MODE_DCB:
- case TC_MQPRIO_MODE_CHANNEL:
- dev->netdev_ops->ndo_setup_tc(dev,
- TC_SETUP_QDISC_MQPRIO,
- &mqprio);
- break;
- default:
- return;
- }
- } else {
+ if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc)
+ mqprio_disable_offload(sch);
+ else
netdev_set_num_tc(dev, 0);
- }
}
-static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ const struct tc_mqprio_caps *caps,
+ struct netlink_ext_ack *extack)
{
- int i, j;
-
- /* Verify num_tc is not out of max range */
- if (qopt->num_tc > TC_MAX_QUEUE)
- return -EINVAL;
-
- /* Verify priority mapping uses valid tcs */
- for (i = 0; i < TC_BITMASK + 1; i++) {
- if (qopt->prio_tc_map[i] >= qopt->num_tc)
- return -EINVAL;
- }
+ int err;
/* Limit qopt->hw to maximum supported offload value. Drivers have
 	 * the option of overriding this later if they don't support a
@@ -80,31 +118,23 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX)
qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX;
- /* If hardware offload is requested we will leave it to the device
- * to either populate the queue counts itself or to validate the
- * provided queue counts. If ndo_setup_tc is not present then
- * hardware doesn't support offload and we should return an error.
+ /* If hardware offload is requested, we will leave 3 options to the
+ * device driver:
+ * - populate the queue counts itself (and ignore what was requested)
+ * - validate the provided queue counts by itself (and apply them)
+ * - request queue count validation here (and apply them)
*/
- if (qopt->hw)
- return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL;
-
- for (i = 0; i < qopt->num_tc; i++) {
- unsigned int last = qopt->offset[i] + qopt->count[i];
-
- /* Verify the queue count is in tx range being equal to the
- * real_num_tx_queues indicates the last queue is in use.
- */
- if (qopt->offset[i] >= dev->real_num_tx_queues ||
- !qopt->count[i] ||
- last > dev->real_num_tx_queues)
- return -EINVAL;
-
- /* Verify that the offset and counts do not overlap */
- for (j = i + 1; j < qopt->num_tc; j++) {
- if (last > qopt->offset[j])
- return -EINVAL;
- }
- }
+ err = mqprio_validate_qopt(dev, qopt,
+ !qopt->hw || caps->validate_queue_counts,
+ false, extack);
+ if (err)
+ return err;
+
+ /* If ndo_setup_tc is not present then hardware doesn't support offload
+ * and we should return an error.
+ */
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+ return -EINVAL;
return 0;
}
@@ -130,6 +160,67 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
return 0;
}
+static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
+ struct nlattr *opt)
+{
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct nlattr *tb[TCA_MQPRIO_MAX + 1];
+ struct nlattr *attr;
+ int i, rem, err;
+
+ err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
+ sizeof(*qopt));
+ if (err < 0)
+ return err;
+
+ if (!qopt->hw)
+ return -EINVAL;
+
+ if (tb[TCA_MQPRIO_MODE]) {
+ priv->flags |= TC_MQPRIO_F_MODE;
+ priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
+ }
+
+ if (tb[TCA_MQPRIO_SHAPER]) {
+ priv->flags |= TC_MQPRIO_F_SHAPER;
+ priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
+ }
+
+ if (tb[TCA_MQPRIO_MIN_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ return -EINVAL;
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
+ return -EINVAL;
+ if (i >= qopt->num_tc)
+ break;
+ priv->min_rate[i] = *(u64 *)nla_data(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MIN_RATE;
+ }
+
+ if (tb[TCA_MQPRIO_MAX_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ return -EINVAL;
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
+ return -EINVAL;
+ if (i >= qopt->num_tc)
+ break;
+ priv->max_rate[i] = *(u64 *)nla_data(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MAX_RATE;
+ }
+
+ return 0;
+}
+
static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -139,9 +230,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
struct Qdisc *qdisc;
int i, err = -EOPNOTSUPP;
struct tc_mqprio_qopt *qopt = NULL;
- struct nlattr *tb[TCA_MQPRIO_MAX + 1];
- struct nlattr *attr;
- int rem;
+ struct tc_mqprio_caps caps;
int len;
BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
@@ -160,61 +249,18 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt || nla_len(opt) < sizeof(*qopt))
return -EINVAL;
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
+ &caps, sizeof(caps));
+
qopt = nla_data(opt);
- if (mqprio_parse_opt(dev, qopt))
+ if (mqprio_parse_opt(dev, qopt, &caps, extack))
return -EINVAL;
len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
if (len > 0) {
- err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
- sizeof(*qopt));
- if (err < 0)
+ err = mqprio_parse_nlattr(sch, qopt, opt);
+ if (err)
return err;
-
- if (!qopt->hw)
- return -EINVAL;
-
- if (tb[TCA_MQPRIO_MODE]) {
- priv->flags |= TC_MQPRIO_F_MODE;
- priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
- }
-
- if (tb[TCA_MQPRIO_SHAPER]) {
- priv->flags |= TC_MQPRIO_F_SHAPER;
- priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
- }
-
- if (tb[TCA_MQPRIO_MIN_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
- return -EINVAL;
- i = 0;
- nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
- rem) {
- if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
- return -EINVAL;
- if (i >= qopt->num_tc)
- break;
- priv->min_rate[i] = *(u64 *)nla_data(attr);
- i++;
- }
- priv->flags |= TC_MQPRIO_F_MIN_RATE;
- }
-
- if (tb[TCA_MQPRIO_MAX_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
- return -EINVAL;
- i = 0;
- nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
- rem) {
- if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
- return -EINVAL;
- if (i >= qopt->num_tc)
- break;
- priv->max_rate[i] = *(u64 *)nla_data(attr);
- i++;
- }
- priv->flags |= TC_MQPRIO_F_MAX_RATE;
- }
}
/* pre-allocate qdisc, attachment can't fail */
@@ -241,36 +287,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
* supplied and verified mapping
*/
if (qopt->hw) {
- struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
-
- switch (priv->mode) {
- case TC_MQPRIO_MODE_DCB:
- if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
- return -EINVAL;
- break;
- case TC_MQPRIO_MODE_CHANNEL:
- mqprio.flags = priv->flags;
- if (priv->flags & TC_MQPRIO_F_MODE)
- mqprio.mode = priv->mode;
- if (priv->flags & TC_MQPRIO_F_SHAPER)
- mqprio.shaper = priv->shaper;
- if (priv->flags & TC_MQPRIO_F_MIN_RATE)
- for (i = 0; i < mqprio.qopt.num_tc; i++)
- mqprio.min_rate[i] = priv->min_rate[i];
- if (priv->flags & TC_MQPRIO_F_MAX_RATE)
- for (i = 0; i < mqprio.qopt.num_tc; i++)
- mqprio.max_rate[i] = priv->max_rate[i];
- break;
- default:
- return -EINVAL;
- }
- err = dev->netdev_ops->ndo_setup_tc(dev,
- TC_SETUP_QDISC_MQPRIO,
- &mqprio);
+ err = mqprio_enable_offload(sch, qopt, extack);
if (err)
return err;
-
- priv->hw_offload = mqprio.qopt.hw;
} else {
netdev_set_num_tc(dev, qopt->num_tc);
for (i = 0; i < qopt->num_tc; i++)
@@ -387,7 +406,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
struct tc_mqprio_qopt opt = { 0 };
struct Qdisc *qdisc;
- unsigned int ntx, tc;
+ unsigned int ntx;
sch->q.qlen = 0;
gnet_stats_basic_sync_init(&sch->bstats);
@@ -411,15 +430,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
spin_unlock_bh(qdisc_lock(qdisc));
}
- opt.num_tc = netdev_get_num_tc(dev);
- memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ mqprio_qopt_reconstruct(dev, &opt);
opt.hw = priv->hw_offload;
- for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
- opt.count[tc] = dev->tc_to_txq[tc].count;
- opt.offset[tc] = dev->tc_to_txq[tc].offset;
- }
-
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
new file mode 100644
index 000000000000..c58a533b8ec5
--- /dev/null
+++ b/net/sched/sch_mqprio_lib.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/types.h>
+#include <net/pkt_sched.h>
+
+#include "sch_mqprio_lib.h"
+
+/* Returns true if the intervals [a, b) and [c, d) overlap. */
+static bool intervals_overlap(int a, int b, int c, int d)
+{
+ int left = max(a, c), right = min(b, d);
+
+ return left < right;
+}
+
+static int mqprio_validate_queue_counts(struct net_device *dev,
+ const struct tc_mqprio_qopt *qopt,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack)
+{
+ int i, j;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ if (!qopt->count[i]) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d",
+ i);
+ return -EINVAL;
+ }
+
+		/* Verify that the queue range fits within the available TX
+		 * queues; a range ending exactly at real_num_tx_queues means
+		 * the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ last > dev->real_num_tx_queues) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "Queues %d:%d for TC %d exceed the %d TX queues available",
+ qopt->count[i], qopt->offset[i],
+ i, dev->real_num_tx_queues);
+ return -EINVAL;
+ }
+
+ if (allow_overlapping_txqs)
+ continue;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (intervals_overlap(qopt->offset[i], last,
+ qopt->offset[j],
+ qopt->offset[j] +
+ qopt->count[j])) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "TC %d queues %d@%d overlap with TC %d queues %d@%d",
+ i, qopt->count[i], qopt->offset[i],
+ j, qopt->count[j], qopt->offset[j]);
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ bool validate_queue_counts,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE) {
+ NL_SET_ERR_MSG(extack,
+ "Number of traffic classes is outside valid range");
+ return -EINVAL;
+ }
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i <= TC_BITMASK; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid traffic class in priority to traffic class mapping");
+ return -EINVAL;
+ }
+ }
+
+ if (validate_queue_counts) {
+ err = mqprio_validate_queue_counts(dev, qopt,
+ allow_overlapping_txqs,
+ extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mqprio_validate_qopt);
+
+void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int tc, num_tc = netdev_get_num_tc(dev);
+
+ qopt->num_tc = num_tc;
+ memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map));
+
+ for (tc = 0; tc < num_tc; tc++) {
+ qopt->count[tc] = dev->tc_to_txq[tc].count;
+ qopt->offset[tc] = dev->tc_to_txq[tc].offset;
+ }
+}
+EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
new file mode 100644
index 000000000000..63f725ab8761
--- /dev/null
+++ b/net/sched/sch_mqprio_lib.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __SCH_MQPRIO_LIB_H
+#define __SCH_MQPRIO_LIB_H
+
+#include <linux/types.h>
+
+struct net_device;
+struct netlink_ext_ack;
+struct tc_mqprio_qopt;
+
+int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ bool validate_queue_counts,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack);
+void mqprio_qopt_reconstruct(struct net_device *dev,
+ struct tc_mqprio_qopt *qopt);
+
+#endif
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index c322a61eaeea..1f469861eae3 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -26,7 +26,11 @@
#include <net/sock.h>
#include <net/tcp.h>
+#include "sch_mqprio_lib.h"
+
static LIST_HEAD(taprio_list);
+static struct static_key_false taprio_have_broken_mqprio;
+static struct static_key_false taprio_have_working_mqprio;
#define TAPRIO_ALL_GATES_OPEN -1
@@ -35,15 +39,19 @@ static LIST_HEAD(taprio_list);
#define TAPRIO_FLAGS_INVALID U32_MAX
struct sched_entry {
- struct list_head list;
-
- /* The instant that this entry "closes" and the next one
- * should open, the qdisc will make some effort so that no
- * packet leaves after this time.
+ /* Durations between this GCL entry and the GCL entry where the
+ * respective traffic class gate closes
*/
- ktime_t close_time;
+ u64 gate_duration[TC_MAX_QUEUE];
+ atomic_t budget[TC_MAX_QUEUE];
+ /* The qdisc makes some effort so that no packet leaves
+ * after this time
+ */
+ ktime_t gate_close_time[TC_MAX_QUEUE];
+ struct list_head list;
+ /* Used to calculate when to advance the schedule */
+ ktime_t end_time;
ktime_t next_txtime;
- atomic_t budget;
int index;
u32 gate_mask;
u32 interval;
@@ -51,10 +59,16 @@ struct sched_entry {
};
struct sched_gate_list {
+ /* Longest non-zero contiguous gate durations per traffic class,
+ * or 0 if a traffic class gate never opens during the schedule.
+ */
+ u64 max_open_gate_duration[TC_MAX_QUEUE];
+ u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
+ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
struct rcu_head rcu;
struct list_head entries;
size_t num_entries;
- ktime_t cycle_close_time;
+ ktime_t cycle_end_time;
s64 cycle_time;
s64 cycle_time_extension;
s64 base_time;
@@ -67,6 +81,8 @@ struct taprio_sched {
enum tk_offsets tk_offset;
int clockid;
bool offloaded;
+ bool detected_mqprio;
+ bool broken_mqprio;
atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
* speeds it's sub-nanoseconds per byte
*/
@@ -78,8 +94,8 @@ struct taprio_sched {
struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
struct list_head taprio_list;
- u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
- u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */
+ int cur_txq[TC_MAX_QUEUE];
+ u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
u32 txtime_delay;
};
@@ -88,6 +104,57 @@ struct __tc_taprio_qopt_offload {
struct tc_taprio_qopt_offload offload;
};
+static void taprio_calculate_gate_durations(struct taprio_sched *q,
+ struct sched_gate_list *sched)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sched_entry *entry, *cur;
+ int tc;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ u32 gates_still_open = entry->gate_mask;
+
+ /* For each traffic class, calculate each open gate duration,
+ * starting at this schedule entry and ending at the schedule
+ * entry containing a gate close event for that TC.
+ */
+ cur = entry;
+
+ do {
+ if (!gates_still_open)
+ break;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (!(gates_still_open & BIT(tc)))
+ continue;
+
+ if (cur->gate_mask & BIT(tc))
+ entry->gate_duration[tc] += cur->interval;
+ else
+ gates_still_open &= ~BIT(tc);
+ }
+
+ cur = list_next_entry_circular(cur, &sched->entries, list);
+ } while (cur != entry);
+
+ /* Keep track of the maximum gate duration for each traffic
+ * class, taking care to not confuse a traffic class which is
+ * temporarily closed with one that is always closed.
+ */
+ for (tc = 0; tc < num_tc; tc++)
+ if (entry->gate_duration[tc] &&
+ sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
+ sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
+ }
+}
+
+static bool taprio_entry_allows_tx(ktime_t skb_end_time,
+ struct sched_entry *entry, int tc)
+{
+ return ktime_before(skb_end_time, entry->gate_close_time[tc]);
+}
+
static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
if (!sched)
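
The circular walk in taprio_calculate_gate_durations() is easiest to see on a concrete schedule. Below is a small stand-alone user-space toy (assumed example values, not kernel code) that mirrors the same accumulation over a three-entry gate control list with two traffic classes:

#include <stdio.h>
#include <stdint.h>

#define NUM_ENTRIES	3
#define NUM_TC		2

int main(void)
{
	/* Per-entry interval (ns) and bitmask of open gates (bit 0 = TC0). */
	uint64_t interval[NUM_ENTRIES] = { 300000, 300000, 400000 };
	uint32_t gate_mask[NUM_ENTRIES] = { 0x3, 0x1, 0x2 };
	uint64_t gate_duration[NUM_ENTRIES][NUM_TC] = { { 0 } };

	for (int i = 0; i < NUM_ENTRIES; i++) {
		uint32_t still_open = gate_mask[i];

		/* Walk the schedule circularly, starting at entry i, until
		 * every gate that is open at entry i has closed.
		 */
		for (int k = 0, cur = i; k < NUM_ENTRIES && still_open;
		     k++, cur = (cur + 1) % NUM_ENTRIES) {
			for (int tc = 0; tc < NUM_TC; tc++) {
				if (!(still_open & (1U << tc)))
					continue;
				if (gate_mask[cur] & (1U << tc))
					gate_duration[i][tc] += interval[cur];
				else
					still_open &= ~(1U << tc);
			}
		}

		printf("entry %d: TC0 open %llu ns, TC1 open %llu ns\n", i,
		       (unsigned long long)gate_duration[i][0],
		       (unsigned long long)gate_duration[i][1]);
	}
	return 0;
}

For entry 0 this prints 600000 ns for TC0 (its gate stays open through entries 0 and 1) and 300000 ns for TC1; for entry 2, TC1 accumulates 700000 ns because its gate remains open across the wrap-around into entry 0, which is the same wrap-around that list_next_entry_circular() provides in the kernel version.
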
@@ -180,6 +247,63 @@ static int length_to_duration(struct taprio_sched *q, int len)
return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
}
+static int duration_to_length(struct taprio_sched *q, u64 duration)
+{
+ return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte));
+}
+
+/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the
+ * q->max_sdu[] requested by the user and the max_sdu dynamically determined by
+ * the maximum open gate durations at the given link speed.
+ */
+static void taprio_update_queue_max_sdu(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct qdisc_size_table *stab)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ u32 max_sdu_from_user;
+ u32 max_sdu_dynamic;
+ u32 max_sdu;
+ int tc;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX;
+
+ /* TC gate never closes => keep the queueMaxSDU
+ * selected by the user
+ */
+ if (sched->max_open_gate_duration[tc] == sched->cycle_time) {
+ max_sdu_dynamic = U32_MAX;
+ } else {
+ u32 max_frm_len;
+
+ max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]);
+ /* Compensate for L1 overhead from size table,
+ * but don't let the frame size go negative
+ */
+ if (stab) {
+ max_frm_len -= stab->szopts.overhead;
+ max_frm_len = max_t(int, max_frm_len,
+ dev->hard_header_len + 1);
+ }
+ max_sdu_dynamic = max_frm_len - dev->hard_header_len;
+ if (max_sdu_dynamic > dev->max_mtu)
+ max_sdu_dynamic = U32_MAX;
+ }
+
+ max_sdu = min(max_sdu_dynamic, max_sdu_from_user);
+
+ if (max_sdu != U32_MAX) {
+ sched->max_frm_len[tc] = max_sdu + dev->hard_header_len;
+ sched->max_sdu[tc] = max_sdu;
+ } else {
+ sched->max_frm_len[tc] = U32_MAX; /* never oversized */
+ sched->max_sdu[tc] = 0;
+ }
+ }
+}
+
/* Returns the entry corresponding to next available interval. If
* validate_interval is set, it only validates whether the timestamp occurs
* when the gate corresponding to the skb's traffic class is open.
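
The per-TC limit that taprio_update_queue_max_sdu() ends up enforcing is simply the smaller of the user's queueMaxSDU and what fits into the longest open gate at the current link speed. With assumed example numbers (1 Gbps, a 20 us maximum open gate, a 14-byte Ethernet header; the size-table overhead compensation is left out for brevity), the arithmetic looks like this:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t picos_per_byte = 8000;	/* 1 Gbps: 8 ns, i.e. 8000 ps, per byte */
	uint64_t open_gate_ns = 20000;	/* longest open gate: 20 us */
	uint64_t hard_header_len = 14;	/* Ethernet header */
	uint64_t user_max_sdu = 1500;	/* queueMaxSDU requested by the user */

	/* duration_to_length(): bytes transmittable while the gate is open */
	uint64_t max_frm_len = open_gate_ns * 1000 / picos_per_byte;	/* 2500 */
	uint64_t max_sdu_dynamic = max_frm_len - hard_header_len;	/* 2486 */
	uint64_t max_sdu = user_max_sdu < max_sdu_dynamic ?
			   user_max_sdu : max_sdu_dynamic;		/* 1500 */

	printf("dynamic limit %llu, user limit %llu, effective queueMaxSDU %llu\n",
	       (unsigned long long)max_sdu_dynamic,
	       (unsigned long long)user_max_sdu,
	       (unsigned long long)max_sdu);
	return 0;
}

Had the gate been open for the whole cycle, the dynamic limit would be treated as unlimited (U32_MAX) and only the user-supplied value, if any, would apply.
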
@@ -413,14 +537,33 @@ done:
return txtime;
}
-static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
- struct Qdisc *child, struct sk_buff **to_free)
+/* Devices with full offload are expected to honor this in hardware */
+static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
+ struct sk_buff *skb)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct sched_gate_list *sched;
int prio = skb->priority;
+ bool exceeds = false;
u8 tc;
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ if (sched && skb->len > sched->max_frm_len[tc])
+ exceeds = true;
+ rcu_read_unlock();
+
+ return exceeds;
+}
+
+static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child, struct sk_buff **to_free)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
/* sk_flags are only safe to use on full sockets. */
if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
if (!is_valid_interval(skb, sch))
@@ -431,17 +574,53 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_drop(skb, sch, to_free);
}
- /* Devices with full offload are expected to honor this in hardware */
- tc = netdev_get_prio_tc_map(dev, prio);
- if (skb->len > q->max_frm_len[tc])
- return qdisc_drop(skb, sch, to_free);
-
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return qdisc_enqueue(skb, child, to_free);
}
+static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child,
+ struct sk_buff **to_free)
+{
+ unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
+ netdev_features_t features = netif_skb_features(skb);
+ struct sk_buff *segs, *nskb;
+ int ret;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ slen += segs->len;
+
+ /* FIXME: we should be segmenting to a smaller size
+ * rather than dropping these
+ */
+ if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
+ ret = qdisc_drop(segs, sch, to_free);
+ else
+ ret = taprio_enqueue_one(segs, sch, child, to_free);
+
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ } else {
+ numsegs++;
+ }
+ }
+
+ if (numsegs > 1)
+ qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
+ consume_skb(skb);
+
+ return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
/* Will not be called in the full offload case, since the TX queues are
* attached to the Qdisc created using qdisc_create_dflt()
*/
@@ -458,97 +637,190 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(!child))
return qdisc_drop(skb, sch, to_free);
- /* Large packets might not be transmitted when the transmission duration
- * exceeds any configured interval. Therefore, segment the skb into
- * smaller chunks. Drivers with full offload are expected to handle
- * this in hardware.
- */
- if (skb_is_gso(skb)) {
- unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
- netdev_features_t features = netif_skb_features(skb);
- struct sk_buff *segs, *nskb;
- int ret;
-
- segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
- if (IS_ERR_OR_NULL(segs))
- return qdisc_drop(skb, sch, to_free);
+ if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) {
+ /* Large packets might not be transmitted when the transmission
+ * duration exceeds any configured interval. Therefore, segment
+ * the skb into smaller chunks. Drivers with full offload are
+ * expected to handle this in hardware.
+ */
+ if (skb_is_gso(skb))
+ return taprio_enqueue_segmented(skb, sch, child,
+ to_free);
- skb_list_walk_safe(segs, segs, nskb) {
- skb_mark_not_on_list(segs);
- qdisc_skb_cb(segs)->pkt_len = segs->len;
- slen += segs->len;
+ return qdisc_drop(skb, sch, to_free);
+ }
- ret = taprio_enqueue_one(segs, sch, child, to_free);
- if (ret != NET_XMIT_SUCCESS) {
- if (net_xmit_drop_count(ret))
- qdisc_qstats_drop(sch);
- } else {
- numsegs++;
- }
- }
+ return taprio_enqueue_one(skb, sch, child, to_free);
+}
- if (numsegs > 1)
- qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
- consume_skb(skb);
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+ WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented");
+ return NULL;
+}
+
+static void taprio_set_budgets(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct sched_entry *entry)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ int tc, budget;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ /* Traffic classes which never close have infinite budget */
+ if (entry->gate_duration[tc] == sched->cycle_time)
+ budget = INT_MAX;
+ else
+ budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
+ atomic64_read(&q->picos_per_byte));
+
+ atomic_set(&entry->budget[tc], budget);
+ }
+}
- return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+/* When an skb is sent, it consumes from the budget of all traffic classes */
+static int taprio_update_budgets(struct sched_entry *entry, size_t len,
+ int tc_consumed, int num_tc)
+{
+ int tc, budget, new_budget = 0;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ budget = atomic_read(&entry->budget[tc]);
+ /* Don't consume from infinite budget */
+ if (budget == INT_MAX) {
+ if (tc == tc_consumed)
+ new_budget = budget;
+ continue;
+ }
+
+ if (tc == tc_consumed)
+ new_budget = atomic_sub_return(len, &entry->budget[tc]);
+ else
+ atomic_sub(len, &entry->budget[tc]);
}
- return taprio_enqueue_one(skb, sch, child, to_free);
+ return new_budget;
}
-/* Will not be called in the full offload case, since the TX queues are
- * attached to the Qdisc created using qdisc_create_dflt()
- */
-static struct sk_buff *taprio_peek(struct Qdisc *sch)
+static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
+ struct sched_entry *entry,
+ u32 gate_mask)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- struct sched_entry *entry;
+ struct Qdisc *child = q->qdiscs[txq];
+ int num_tc = netdev_get_num_tc(dev);
struct sk_buff *skb;
- u32 gate_mask;
- int i;
+ ktime_t guard;
+ int prio;
+ int len;
+ u8 tc;
- rcu_read_lock();
- entry = rcu_dereference(q->current_entry);
- gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
- rcu_read_unlock();
+ if (unlikely(!child))
+ return NULL;
- if (!gate_mask)
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+ goto skip_peek_checks;
+
+ skb = child->ops->peek(child);
+ if (!skb)
return NULL;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- int prio;
- u8 tc;
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
- if (unlikely(!child))
- continue;
+ if (!(gate_mask & BIT(tc)))
+ return NULL;
- skb = child->ops->peek(child);
- if (!skb)
- continue;
+ len = qdisc_pkt_len(skb);
+ guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));
- if (TXTIME_ASSIST_IS_ENABLED(q->flags))
- return skb;
+ /* In the case that there's no gate entry, there's no
+ * guard band ...
+ */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ !taprio_entry_allows_tx(guard, entry, tc))
+ return NULL;
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
+ /* ... and no budget. */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ taprio_update_budgets(entry, len, tc, num_tc) < 0)
+ return NULL;
+
+skip_peek_checks:
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ return NULL;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
+{
+ int offset = dev->tc_to_txq[tc].offset;
+ int count = dev->tc_to_txq[tc].count;
+
+ (*txq)++;
+ if (*txq == offset + count)
+ *txq = offset;
+}
+
+/* Prioritize higher traffic classes, and select among TXQs belonging to the
+ * same TC using round robin
+ */
+static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sk_buff *skb;
+ int tc;
+
+ for (tc = num_tc - 1; tc >= 0; tc--) {
+ int first_txq = q->cur_txq[tc];
if (!(gate_mask & BIT(tc)))
continue;
- return skb;
+ do {
+ skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
+ entry, gate_mask);
+
+ taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
+
+ if (skb)
+ return skb;
+ } while (q->cur_txq[tc] != first_txq);
}
return NULL;
}
-static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
+/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
+ * class other than to determine whether the gate is open or not
+ */
+static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
{
- atomic_set(&entry->budget,
- div64_u64((u64)entry->interval * PSEC_PER_NSEC,
- atomic64_read(&q->picos_per_byte)));
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
+ if (skb)
+ return skb;
+ }
+
+ return NULL;
}
/* Will not be called in the full offload case, since the TX queues are
@@ -557,11 +829,9 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct net_device *dev = qdisc_dev(sch);
struct sk_buff *skb = NULL;
struct sched_entry *entry;
u32 gate_mask;
- int i;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
@@ -571,69 +841,23 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
* "AdminGateStates"
*/
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
-
if (!gate_mask)
goto done;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- ktime_t guard;
- int prio;
- int len;
- u8 tc;
-
- if (unlikely(!child))
- continue;
-
- if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
- skb = child->ops->dequeue(child);
- if (!skb)
- continue;
- goto skb_found;
- }
-
- skb = child->ops->peek(child);
- if (!skb)
- continue;
-
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
-
- if (!(gate_mask & BIT(tc))) {
- skb = NULL;
- continue;
- }
-
- len = qdisc_pkt_len(skb);
- guard = ktime_add_ns(taprio_get_time(q),
- length_to_duration(q, len));
-
- /* In the case that there's no gate entry, there's no
- * guard band ...
- */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- ktime_after(guard, entry->close_time)) {
- skb = NULL;
- continue;
- }
-
- /* ... and no budget. */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- atomic_sub_return(len, &entry->budget) < 0) {
- skb = NULL;
- continue;
- }
-
- skb = child->ops->dequeue(child);
- if (unlikely(!skb))
- goto done;
-
-skb_found:
- qdisc_bstats_update(sch, skb);
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
-
- goto done;
+ if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
+ !static_branch_likely(&taprio_have_working_mqprio)) {
+ /* Single NIC kind which is broken */
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ } else if (static_branch_likely(&taprio_have_working_mqprio) &&
+ !static_branch_unlikely(&taprio_have_broken_mqprio)) {
+ /* Single NIC kind which prioritizes properly */
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
+ } else {
+ /* Mixed NIC kinds present in system, need dynamic testing */
+ if (q->broken_mqprio)
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ else
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
}
done:
@@ -648,7 +872,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper,
if (list_is_last(&entry->list, &oper->entries))
return true;
- if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0)
+ if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0)
return true;
return false;
@@ -656,7 +880,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper,
static bool should_change_schedules(const struct sched_gate_list *admin,
const struct sched_gate_list *oper,
- ktime_t close_time)
+ ktime_t end_time)
{
ktime_t next_base_time, extension_time;
@@ -665,18 +889,18 @@ static bool should_change_schedules(const struct sched_gate_list *admin,
next_base_time = sched_base_time(admin);
- /* This is the simple case, the close_time would fall after
+ /* This is the simple case, the end_time would fall after
* the next schedule base_time.
*/
- if (ktime_compare(next_base_time, close_time) <= 0)
+ if (ktime_compare(next_base_time, end_time) <= 0)
return true;
- /* This is the cycle_time_extension case, if the close_time
+ /* This is the cycle_time_extension case, if the end_time
* plus the amount that can be extended would fall after the
* next schedule base_time, we can extend the current schedule
* for that amount.
*/
- extension_time = ktime_add_ns(close_time, oper->cycle_time_extension);
+ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);
/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
* how precisely the extension should be made. So after
@@ -692,10 +916,13 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
struct taprio_sched *q = container_of(timer, struct taprio_sched,
advance_timer);
+ struct net_device *dev = qdisc_dev(q->root);
struct sched_gate_list *oper, *admin;
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *entry, *next;
struct Qdisc *sch = q->root;
- ktime_t close_time;
+ ktime_t end_time;
+ int tc;
spin_lock(&q->current_entry_lock);
entry = rcu_dereference_protected(q->current_entry,
@@ -714,41 +941,49 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
* entry of all schedules are pre-calculated during the
* schedule initialization.
*/
- if (unlikely(!entry || entry->close_time == oper->base_time)) {
+ if (unlikely(!entry || entry->end_time == oper->base_time)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
- close_time = next->close_time;
+ end_time = next->end_time;
goto first_run;
}
if (should_restart_cycle(oper, entry)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
- oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
- oper->cycle_time);
+ oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
+ oper->cycle_time);
} else {
next = list_next_entry(entry, list);
}
- close_time = ktime_add_ns(entry->close_time, next->interval);
- close_time = min_t(ktime_t, close_time, oper->cycle_close_time);
+ end_time = ktime_add_ns(entry->end_time, next->interval);
+ end_time = min_t(ktime_t, end_time, oper->cycle_end_time);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (next->gate_duration[tc] == oper->cycle_time)
+ next->gate_close_time[tc] = KTIME_MAX;
+ else
+ next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
+ next->gate_duration[tc]);
+ }
- if (should_change_schedules(admin, oper, close_time)) {
+ if (should_change_schedules(admin, oper, end_time)) {
/* Set things so the next time this runs, the new
* schedule runs.
*/
- close_time = sched_base_time(admin);
+ end_time = sched_base_time(admin);
switch_schedules(q, &admin, &oper);
}
- next->close_time = close_time;
- taprio_set_budget(q, next);
+ next->end_time = end_time;
+ taprio_set_budgets(q, oper, next);
first_run:
rcu_assign_pointer(q->current_entry, next);
spin_unlock(&q->current_entry_lock);
- hrtimer_set_expires(&q->advance_timer, close_time);
+ hrtimer_set_expires(&q->advance_timer, end_time);
rcu_read_lock();
__netif_schedule(sch);
@@ -916,6 +1151,8 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
new->cycle_time = cycle;
}
+ taprio_calculate_gate_durations(q, new);
+
return 0;
}
@@ -924,7 +1161,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
struct netlink_ext_ack *extack,
u32 taprio_flags)
{
- int i, j;
+ bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags);
if (!qopt && !dev->num_tc) {
NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
@@ -937,52 +1174,17 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
if (dev->num_tc)
return 0;
- /* Verify num_tc is not out of max range */
- if (qopt->num_tc > TC_MAX_QUEUE) {
- NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
- return -EINVAL;
- }
-
/* taprio imposes that traffic classes map 1:n to tx queues */
if (qopt->num_tc > dev->num_tx_queues) {
NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
return -EINVAL;
}
- /* Verify priority mapping uses valid tcs */
- for (i = 0; i <= TC_BITMASK; i++) {
- if (qopt->prio_tc_map[i] >= qopt->num_tc) {
- NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
- return -EINVAL;
- }
- }
-
- for (i = 0; i < qopt->num_tc; i++) {
- unsigned int last = qopt->offset[i] + qopt->count[i];
-
- /* Verify the queue count is in tx range being equal to the
- * real_num_tx_queues indicates the last queue is in use.
- */
- if (qopt->offset[i] >= dev->num_tx_queues ||
- !qopt->count[i] ||
- last > dev->real_num_tx_queues) {
- NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
- return -EINVAL;
- }
-
- if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
- continue;
-
- /* Verify that the offset and counts do not overlap */
- for (j = i + 1; j < qopt->num_tc; j++) {
- if (last > qopt->offset[j]) {
- NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
- return -EINVAL;
- }
- }
- }
-
- return 0;
+ /* For some reason, in txtime-assist mode, we allow TXQ ranges for
+ * different TCs to overlap, and just validate the TXQ ranges.
+ */
+ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs,
+ extack);
}
static int taprio_get_start_time(struct Qdisc *sch,
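
The validation deleted above (each TC's offset/count must fall within the device's TX queue range and, outside txtime-assist mode, the ranges of different TCs must not overlap) moves into the shared mqprio library. A hedged sketch of the overlap part, restating the deleted loop rather than quoting the library's actual body, assuming struct tc_mqprio_qopt from <linux/pkt_sched.h> and ranges listed in ascending offset order:

    /* Sketch of the TXQ-range overlap check that mqprio_validate_qopt() is
     * expected to take over; returns nonzero if the range of TC i spills
     * into the range of a later TC.
     */
    static int sketch_txq_ranges_overlap(const struct tc_mqprio_qopt *qopt)
    {
            int i, j;

            for (i = 0; i < qopt->num_tc; i++) {
                    unsigned int last = qopt->offset[i] + qopt->count[i];

                    for (j = i + 1; j < qopt->num_tc; j++)
                            if (last > qopt->offset[j])
                                    return 1;
            }
            return 0;
    }
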
@@ -1019,11 +1221,14 @@ static int taprio_get_start_time(struct Qdisc *sch,
return 0;
}
-static void setup_first_close_time(struct taprio_sched *q,
- struct sched_gate_list *sched, ktime_t base)
+static void setup_first_end_time(struct taprio_sched *q,
+ struct sched_gate_list *sched, ktime_t base)
{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *first;
ktime_t cycle;
+ int tc;
first = list_first_entry(&sched->entries,
struct sched_entry, list);
@@ -1031,10 +1236,18 @@ static void setup_first_close_time(struct taprio_sched *q,
cycle = sched->cycle_time;
/* FIXME: find a better place to do this */
- sched->cycle_close_time = ktime_add_ns(base, cycle);
+ sched->cycle_end_time = ktime_add_ns(base, cycle);
+
+ first->end_time = ktime_add_ns(base, first->interval);
+ taprio_set_budgets(q, sched, first);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (first->gate_duration[tc] == sched->cycle_time)
+ first->gate_close_time[tc] = KTIME_MAX;
+ else
+ first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
+ }
- first->close_time = ktime_add_ns(base, first->interval);
- taprio_set_budget(q, first);
rcu_assign_pointer(q->current_entry, NULL);
}
@@ -1088,6 +1301,8 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct sched_gate_list *oper, *admin;
+ struct qdisc_size_table *stab;
struct taprio_sched *q;
ASSERT_RTNL();
@@ -1100,6 +1315,17 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
continue;
taprio_set_picos_per_byte(dev, q);
+
+ stab = rtnl_dereference(q->root->stab);
+
+ oper = rtnl_dereference(q->oper_sched);
+ if (oper)
+ taprio_update_queue_max_sdu(q, oper, stab);
+
+ admin = rtnl_dereference(q->admin_sched);
+ if (admin)
+ taprio_update_queue_max_sdu(q, admin, stab);
+
break;
}
@@ -1203,7 +1429,8 @@ static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
static void taprio_sched_to_offload(struct net_device *dev,
struct sched_gate_list *sched,
- struct tc_taprio_qopt_offload *offload)
+ struct tc_taprio_qopt_offload *offload,
+ const struct tc_taprio_caps *caps)
{
struct sched_entry *entry;
int i = 0;
@@ -1217,7 +1444,11 @@ static void taprio_sched_to_offload(struct net_device *dev,
e->command = entry->command;
e->interval = entry->interval;
- e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask);
+ if (caps->gate_mask_per_txq)
+ e->gate_mask = tc_map_to_queue_mask(dev,
+ entry->gate_mask);
+ else
+ e->gate_mask = entry->gate_mask;
i++;
}
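
tc_map_to_queue_mask() expands a gate mask expressed per traffic class into one expressed per TX queue using the netdev's tc_to_txq table; with this change, only drivers advertising gate_mask_per_txq in their taprio capabilities receive the expanded form, the rest get the TC mask untouched. A hedged standalone sketch of that expansion, with txq_offset[]/txq_count[] standing in for dev->tc_to_txq[]:

    #include <stdint.h>

    /* Illustrative sketch of a TC-mask to TXQ-mask expansion; not the
     * kernel function itself.
     */
    static uint32_t sketch_tc_mask_to_txq_mask(const unsigned int *txq_offset,
                                               const unsigned int *txq_count,
                                               int num_tc, uint32_t tc_mask)
    {
            uint32_t txq_mask = 0;
            int tc;

            for (tc = 0; tc < num_tc; tc++) {
                    unsigned int q;

                    if (!(tc_mask & (1U << tc)))
                            continue;

                    for (q = txq_offset[tc];
                         q < txq_offset[tc] + txq_count[tc]; q++)
                            txq_mask |= 1U << q;
            }
            return txq_mask;
    }
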
@@ -1225,6 +1456,34 @@ static void taprio_sched_to_offload(struct net_device *dev,
offload->num_entries = i;
}
+static void taprio_detect_broken_mqprio(struct taprio_sched *q)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ struct tc_taprio_caps caps;
+
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
+ &caps, sizeof(caps));
+
+ q->broken_mqprio = caps.broken_mqprio;
+ if (q->broken_mqprio)
+ static_branch_inc(&taprio_have_broken_mqprio);
+ else
+ static_branch_inc(&taprio_have_working_mqprio);
+
+ q->detected_mqprio = true;
+}
+
+static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
+{
+ if (!q->detected_mqprio)
+ return;
+
+ if (q->broken_mqprio)
+ static_branch_dec(&taprio_have_broken_mqprio);
+ else
+ static_branch_dec(&taprio_have_working_mqprio);
+}
+
static int taprio_enable_offload(struct net_device *dev,
struct taprio_sched *q,
struct sched_gate_list *sched,
@@ -1261,7 +1520,8 @@ static int taprio_enable_offload(struct net_device *dev,
return -ENOMEM;
}
offload->enable = 1;
- taprio_sched_to_offload(dev, sched, offload);
+ mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
+ taprio_sched_to_offload(dev, sched, offload, &caps);
for (tc = 0; tc < TC_MAX_QUEUE; tc++)
offload->max_sdu[tc] = q->max_sdu[tc];
@@ -1452,7 +1712,6 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct net_device *dev = qdisc_dev(sch);
u32 max_sdu[TC_QOPT_MAX_QUEUE];
unsigned long seen_tcs = 0;
struct nlattr *n;
@@ -1466,18 +1725,14 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
continue;
- err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack);
+ err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
+ extack);
if (err)
goto out;
}
- for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
q->max_sdu[tc] = max_sdu[tc];
- if (max_sdu[tc])
- q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len;
- else
- q->max_frm_len[tc] = U32_MAX; /* never oversized */
- }
out:
return err;
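
The per-TC frame length bound computed by the deleted loop (the user's max SDU plus the device's link-layer header length, or effectively unlimited when no max SDU was given) is no longer cached at parse time; with this patch it follows the schedule instead (see the taprio_update_queue_max_sdu() calls added elsewhere in this diff). A restatement of the deleted rule, for reference only:

    #include <stdint.h>

    /* What the deleted loop computed per TC: cap frames at max_sdu plus the
     * L2 header, or never treat them as oversized when no max SDU was set.
     */
    static uint32_t sketch_max_frm_len(uint32_t max_sdu,
                                       uint32_t hard_header_len)
    {
            return max_sdu ? max_sdu + hard_header_len : UINT32_MAX;
    }
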
@@ -1533,6 +1788,7 @@ static int taprio_new_flags(const struct nlattr *attr, u32 old,
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
struct sched_gate_list *oper, *admin, *new_admin;
struct taprio_sched *q = qdisc_priv(sch);
@@ -1585,6 +1841,23 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto free_sched;
}
+ if (mqprio) {
+ err = netdev_set_num_tc(dev, mqprio->num_tc);
+ if (err)
+ goto free_sched;
+ for (i = 0; i < mqprio->num_tc; i++) {
+ netdev_set_tc_queue(dev, i,
+ mqprio->count[i],
+ mqprio->offset[i]);
+ q->cur_txq[i] = mqprio->offset[i];
+ }
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i <= TC_BITMASK; i++)
+ netdev_set_prio_tc_map(dev, i,
+ mqprio->prio_tc_map[i]);
+ }
+
err = parse_taprio_schedule(q, tb, new_admin, extack);
if (err < 0)
goto free_sched;
@@ -1600,21 +1873,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto free_sched;
taprio_set_picos_per_byte(dev, q);
-
- if (mqprio) {
- err = netdev_set_num_tc(dev, mqprio->num_tc);
- if (err)
- goto free_sched;
- for (i = 0; i < mqprio->num_tc; i++)
- netdev_set_tc_queue(dev, i,
- mqprio->count[i],
- mqprio->offset[i]);
-
- /* Always use supplied priority mappings */
- for (i = 0; i <= TC_BITMASK; i++)
- netdev_set_prio_tc_map(dev, i,
- mqprio->prio_tc_map[i]);
- }
+ taprio_update_queue_max_sdu(q, new_admin, stab);
if (FULL_OFFLOAD_IS_ENABLED(q->flags))
err = taprio_enable_offload(dev, q, new_admin, extack);
@@ -1663,7 +1922,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
} else {
- setup_first_close_time(q, new_admin, start);
+ setup_first_end_time(q, new_admin, start);
/* Protects against advance_sched() */
spin_lock_irqsave(&q->current_entry_lock, flags);
@@ -1683,6 +1942,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
new_admin = NULL;
err = 0;
+ if (!stab)
+ NL_SET_ERR_MSG_MOD(extack,
+ "Size table not specified, frame length estimations may be inaccurate");
+
unlock:
spin_unlock_bh(qdisc_lock(sch));
@@ -1743,6 +2006,8 @@ static void taprio_destroy(struct Qdisc *sch)
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ taprio_cleanup_broken_mqprio(q);
}
static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1807,6 +2072,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
q->qdiscs[i] = qdisc;
}
+ taprio_detect_broken_mqprio(q);
+
return taprio_change(sch, opt, extack);
}
@@ -1947,7 +2214,8 @@ error_nest:
return -1;
}
-static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
+static int taprio_dump_tc_entries(struct sk_buff *skb,
+ struct sched_gate_list *sched)
{
struct nlattr *n;
int tc;
@@ -1961,7 +2229,7 @@ static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
- q->max_sdu[tc]))
+ sched->max_sdu[tc]))
goto nla_put_failure;
nla_nest_end(skb, n);
@@ -1981,18 +2249,11 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct sched_gate_list *oper, *admin;
struct tc_mqprio_qopt opt = { 0 };
struct nlattr *nest, *sched_nest;
- unsigned int i;
oper = rtnl_dereference(q->oper_sched);
admin = rtnl_dereference(q->admin_sched);
- opt.num_tc = netdev_get_num_tc(dev);
- memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
-
- for (i = 0; i < netdev_get_num_tc(dev); i++) {
- opt.count[i] = dev->tc_to_txq[i].count;
- opt.offset[i] = dev->tc_to_txq[i].offset;
- }
+ mqprio_qopt_reconstruct(dev, &opt);
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
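
mqprio_qopt_reconstruct() replaces the open-coded block deleted above, rebuilding a struct tc_mqprio_qopt from the netdev's live traffic-class state so mqprio and taprio can share the logic. A sketch equivalent to the deleted lines (kernel context assumed: <linux/netdevice.h>, <linux/pkt_sched.h>; the shared helper may do more):

    /* Rebuild mqprio options from the device's current TC configuration;
     * this restates the deleted block, not the library helper's exact body.
     */
    static void sketch_qopt_reconstruct(struct net_device *dev,
                                        struct tc_mqprio_qopt *opt)
    {
            int i;

            opt->num_tc = netdev_get_num_tc(dev);
            memcpy(opt->prio_tc_map, dev->prio_tc_map,
                   sizeof(opt->prio_tc_map));

            for (i = 0; i < opt->num_tc; i++) {
                    opt->count[i] = dev->tc_to_txq[i].count;
                    opt->offset[i] = dev->tc_to_txq[i].offset;
            }
    }
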
@@ -2012,7 +2273,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
goto options_error;
- if (taprio_dump_tc_entries(q, skb))
+ if (oper && taprio_dump_tc_entries(skb, oper))
goto options_error;
if (oper && dump_schedule(skb, oper))