Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig  441
-rw-r--r--  net/sched/Makefile  17
-rw-r--r--  net/sched/act_api.c  1405
-rw-r--r--  net/sched/act_bpf.c  112
-rw-r--r--  net/sched/act_connmark.c  198
-rw-r--r--  net/sched/act_csum.c  171
-rw-r--r--  net/sched/act_ct.c  1698
-rw-r--r--  net/sched/act_ctinfo.c  403
-rw-r--r--  net/sched/act_gact.c  151
-rw-r--r--  net/sched/act_gate.c  676
-rw-r--r--  net/sched/act_ife.c  176
-rw-r--r--  net/sched/act_ipt.c  450
-rw-r--r--  net/sched/act_meta_mark.c  7
-rw-r--r--  net/sched/act_meta_skbprio.c  7
-rw-r--r--  net/sched/act_meta_skbtcindex.c  7
-rw-r--r--  net/sched/act_mirred.c  509
-rw-r--r--  net/sched/act_mpls.c  489
-rw-r--r--  net/sched/act_nat.c  159
-rw-r--r--  net/sched/act_pedit.c  531
-rw-r--r--  net/sched/act_police.c  337
-rw-r--r--  net/sched/act_sample.c  200
-rw-r--r--  net/sched/act_simple.c  134
-rw-r--r--  net/sched/act_skbedit.c  244
-rw-r--r--  net/sched/act_skbmod.c  164
-rw-r--r--  net/sched/act_tunnel_key.c  410
-rw-r--r--  net/sched/act_vlan.c  236
-rw-r--r--  net/sched/bpf_qdisc.c  472
-rw-r--r--  net/sched/cls_api.c  3092
-rw-r--r--  net/sched/cls_basic.c  57
-rw-r--r--  net/sched/cls_bpf.c  206
-rw-r--r--  net/sched/cls_cgroup.c  40
-rw-r--r--  net/sched/cls_flow.c  60
-rw-r--r--  net/sched/cls_flower.c  2540
-rw-r--r--  net/sched/cls_fw.c  84
-rw-r--r--  net/sched/cls_matchall.c  175
-rw-r--r--  net/sched/cls_route.c  115
-rw-r--r--  net/sched/cls_rsvp.c  28
-rw-r--r--  net/sched/cls_rsvp.h  772
-rw-r--r--  net/sched/cls_rsvp6.c  28
-rw-r--r--  net/sched/cls_tcindex.c  673
-rw-r--r--  net/sched/cls_u32.c  375
-rw-r--r--  net/sched/em_canid.c  11
-rw-r--r--  net/sched/em_cmp.c  16
-rw-r--r--  net/sched/em_ipset.c  7
-rw-r--r--  net/sched/em_ipt.c  62
-rw-r--r--  net/sched/em_meta.c  39
-rw-r--r--  net/sched/em_nbyte.c  13
-rw-r--r--  net/sched/em_text.c  24
-rw-r--r--  net/sched/em_u32.c  7
-rw-r--r--  net/sched/ematch.c  25
-rw-r--r--  net/sched/sch_api.c  925
-rw-r--r--  net/sched/sch_atm.c  705
-rw-r--r--  net/sched/sch_blackhole.c  6
-rw-r--r--  net/sched/sch_cake.c  768
-rw-r--r--  net/sched/sch_cbq.c  1807
-rw-r--r--  net/sched/sch_cbs.c  163
-rw-r--r--  net/sched/sch_choke.c  62
-rw-r--r--  net/sched/sch_codel.c  102
-rw-r--r--  net/sched/sch_drr.c  86
-rw-r--r--  net/sched/sch_dsmark.c  517
-rw-r--r--  net/sched/sch_dualpi2.c  1177
-rw-r--r--  net/sched/sch_etf.c  40
-rw-r--r--  net/sched/sch_ets.c  839
-rw-r--r--  net/sched/sch_fifo.c  123
-rw-r--r--  net/sched/sch_fq.c  767
-rw-r--r--  net/sched/sch_fq_codel.c  151
-rw-r--r--  net/sched/sch_fq_pie.c  595
-rw-r--r--  net/sched/sch_frag.c  160
-rw-r--r--  net/sched/sch_generic.c  815
-rw-r--r--  net/sched/sch_gred.c  127
-rw-r--r--  net/sched/sch_hfsc.c  154
-rw-r--r--  net/sched/sch_hhf.c  70
-rw-r--r--  net/sched/sch_htb.c  760
-rw-r--r--  net/sched/sch_ingress.c  108
-rw-r--r--  net/sched/sch_mq.c  49
-rw-r--r--  net/sched/sch_mqprio.c  546
-rw-r--r--  net/sched/sch_mqprio_lib.c  132
-rw-r--r--  net/sched/sch_mqprio_lib.h  20
-rw-r--r--  net/sched/sch_multiq.c  61
-rw-r--r--  net/sched/sch_netem.c  331
-rw-r--r--  net/sched/sch_pie.c  443
-rw-r--r--  net/sched/sch_plug.c  13
-rw-r--r--  net/sched/sch_prio.c  49
-rw-r--r--  net/sched/sch_qfq.c  207
-rw-r--r--  net/sched/sch_red.c  199
-rw-r--r--  net/sched/sch_sfb.c  71
-rw-r--r--  net/sched/sch_sfq.c  169
-rw-r--r--  net/sched/sch_skbprio.c  34
-rw-r--r--  net/sched/sch_taprio.c  2408
-rw-r--r--  net/sched/sch_tbf.c  126
-rw-r--r--  net/sched/sch_teql.c  21
91 files changed, 22578 insertions, 11581 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 1b9afdee5ba9..6ddff028b81a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Traffic control configuration.
#
@@ -5,7 +6,7 @@
menuconfig NET_SCHED
bool "QoS and/or fair queueing"
select NET_SCH_FIFO
- ---help---
+ help
When the kernel has several packets to send out over a network
device, it has to decide which ones to send first, which ones to
delay, and which ones to drop. This is the job of the queueing
@@ -44,26 +45,9 @@ if NET_SCHED
comment "Queueing/Scheduling"
-config NET_SCH_CBQ
- tristate "Class Based Queueing (CBQ)"
- ---help---
- Say Y here if you want to use the Class-Based Queueing (CBQ) packet
- scheduling algorithm. This algorithm classifies the waiting packets
- into a tree-like hierarchy of classes; the leaves of this tree are
- in turn scheduled by separate algorithms.
-
- See the top of <file:net/sched/sch_cbq.c> for more details.
-
- CBQ is a commonly used scheduler, so if you're unsure, you should
- say Y here. Then say Y to all the queueing algorithms below that you
- want to use as leaf disciplines.
-
- To compile this code as a module, choose M here: the
- module will be called sch_cbq.
-
config NET_SCH_HTB
tristate "Hierarchical Token Bucket (HTB)"
- ---help---
+ help
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
packet scheduling algorithm. See
<http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
@@ -77,30 +61,16 @@ config NET_SCH_HTB
config NET_SCH_HFSC
tristate "Hierarchical Fair Service Curve (HFSC)"
- ---help---
+ help
Say Y here if you want to use the Hierarchical Fair Service Curve
(HFSC) packet scheduling algorithm.
To compile this code as a module, choose M here: the
module will be called sch_hfsc.
-config NET_SCH_ATM
- tristate "ATM Virtual Circuits (ATM)"
- depends on ATM
- ---help---
- Say Y here if you want to use the ATM pseudo-scheduler. This
- provides a framework for invoking classifiers, which in turn
- select classes of this queuing discipline. Each class maps
- the flow(s) it is handling to a given virtual circuit.
-
- See the top of <file:net/sched/sch_atm.c> for more details.
-
- To compile this code as a module, choose M here: the
- module will be called sch_atm.
-
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
- ---help---
+ help
Say Y here if you want to use an n-band priority queue packet
scheduler.
@@ -109,7 +79,7 @@ config NET_SCH_PRIO
config NET_SCH_MULTIQ
tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
- ---help---
+ help
Say Y here if you want to use an n-band queue packet scheduler
to support devices that have multiple hardware transmit queues.
@@ -118,7 +88,7 @@ config NET_SCH_MULTIQ
config NET_SCH_RED
tristate "Random Early Detection (RED)"
- ---help---
+ help
Say Y here if you want to use the Random Early Detection (RED)
packet scheduling algorithm.
@@ -129,7 +99,7 @@ config NET_SCH_RED
config NET_SCH_SFB
tristate "Stochastic Fair Blue (SFB)"
- ---help---
+ help
Say Y here if you want to use the Stochastic Fair Blue (SFB)
packet scheduling algorithm.
@@ -140,7 +110,7 @@ config NET_SCH_SFB
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
- ---help---
+ help
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
packet scheduling algorithm.
@@ -151,7 +121,7 @@ config NET_SCH_SFQ
config NET_SCH_TEQL
tristate "True Link Equalizer (TEQL)"
- ---help---
+ help
Say Y here if you want to use the True Link Equalizer (TLE) packet
scheduling algorithm. This queueing discipline allows the combination
of several physical devices into one virtual device.
@@ -163,7 +133,7 @@ config NET_SCH_TEQL
config NET_SCH_TBF
tristate "Token Bucket Filter (TBF)"
- ---help---
+ help
Say Y here if you want to use the Token Bucket Filter (TBF) packet
scheduling algorithm.
@@ -174,7 +144,7 @@ config NET_SCH_TBF
config NET_SCH_CBS
tristate "Credit Based Shaper (CBS)"
- ---help---
+ help
Say Y here if you want to use the Credit Based Shaper (CBS) packet
scheduling algorithm.
@@ -194,8 +164,14 @@ config NET_SCH_ETF
To compile this code as a module, choose M here: the
module will be called sch_etf.
+config NET_SCH_MQPRIO_LIB
+ tristate
+ help
+ Common library for manipulating mqprio queue configurations.
+
config NET_SCH_TAPRIO
tristate "Time Aware Priority (taprio) Scheduler"
+ select NET_SCH_MQPRIO_LIB
help
Say Y here if you want to use the Time Aware Priority (taprio) packet
scheduling algorithm.
@@ -207,7 +183,7 @@ config NET_SCH_TAPRIO
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
- ---help---
+ help
Say Y here if you want to use the Generic Random Early Detection
(GRED) packet scheduling algorithm for some of your network devices
(see the top of <file:net/sched/sch_red.c> for details and
@@ -216,20 +192,9 @@ config NET_SCH_GRED
To compile this code as a module, choose M here: the
module will be called sch_gred.
-config NET_SCH_DSMARK
- tristate "Differentiated Services marker (DSMARK)"
- ---help---
- Say Y if you want to schedule packets according to the
- Differentiated Services architecture proposed in RFC 2475.
- Technical information on this method, with pointers to associated
- RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
-
- To compile this code as a module, choose M here: the
- module will be called sch_dsmark.
-
config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
- ---help---
+ help
Say Y if you want to emulate network delay, loss, and packet
re-ordering. This is often useful to simulate networks when
testing applications or protocols.
@@ -252,6 +217,7 @@ config NET_SCH_DRR
config NET_SCH_MQPRIO
tristate "Multi-queue priority scheduler (MQPRIO)"
+ select NET_SCH_MQPRIO_LIB
help
Say Y here if you want to use the Multi-queue Priority scheduler.
This scheduler allows QOS to be offloaded on NICs that have support
@@ -280,7 +246,7 @@ config NET_SCH_CHOKE
help
Say Y here if you want to use the CHOKe packet scheduler (CHOose
and Keep for responsive flows, CHOose and Kill for unresponsive
- flows). This is a variation of RED which trys to penalize flows
+ flows). This is a variation of RED which tries to penalize flows
that monopolize the queue.
To compile this code as a module, choose M here: the
@@ -323,7 +289,7 @@ config NET_SCH_CAKE
tristate "Common Applications Kept Enhanced (CAKE)"
help
Say Y here if you want to use the Common Applications Kept Enhanced
- (CAKE) queue management algorithm.
+ (CAKE) queue management algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_cake.
@@ -336,7 +302,7 @@ config NET_SCH_FQ
Say Y here if you want to use the FQ packet scheduling algorithm.
FQ does flow separation, and is able to respect pacing requirements
- set by TCP stack into sk->sk_pacing_rate (for localy generated
+ set by TCP stack into sk->sk_pacing_rate (for locally generated
traffic)
To compile this driver as a module, choose M here: the module
@@ -358,20 +324,31 @@ config NET_SCH_PIE
help
Say Y here if you want to use the Proportional Integral controller
Enhanced scheduler packet scheduling algorithm.
- For more information, please see
- http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+ For more information, please see https://tools.ietf.org/html/rfc8033
To compile this driver as a module, choose M here: the module
will be called sch_pie.
If unsure, say N.
+config NET_SCH_FQ_PIE
+ depends on NET_SCH_PIE
+ tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"
+ help
+ Say Y here if you want to use the Flow Queue Proportional Integral
+ controller Enhanced (FQ-PIE) packet scheduling algorithm.
+ For more information, please see https://tools.ietf.org/html/rfc8033
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq_pie.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress/classifier-action Qdisc"
depends on NET_CLS_ACT
- select NET_INGRESS
- select NET_EGRESS
- ---help---
+ select NET_XGRESS
+ help
Say Y here if you want to use classifiers for incoming and/or outgoing
packets. This qdisc doesn't do anything else besides running classifiers,
which can also have actions attached to them. In case of outgoing packets,
@@ -385,7 +362,7 @@ config NET_SCH_INGRESS
config NET_SCH_PLUG
tristate "Plug network traffic until release (PLUG)"
- ---help---
+ help
This queuing discipline allows userspace to plug/unplug a network
output queue, using the netlink interface. When it receives an
@@ -409,9 +386,50 @@ config NET_SCH_PLUG
To compile this code as a module, choose M here: the
module will be called sch_plug.
+config NET_SCH_ETS
+ tristate "Enhanced transmission selection scheduler (ETS)"
+ help
+ The Enhanced Transmission Selection scheduler is a classful
+ queuing discipline that merges functionality of PRIO and DRR
+ qdiscs in one scheduler. ETS makes it easy to configure a set of
+ strict and bandwidth-sharing bands to implement the transmission
+ selection described in 802.1Qaz.
+
+ Say Y here if you want to use the ETS packet scheduling
+ algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_ets.
+
+ If unsure, say N.
+
+config NET_SCH_BPF
+ bool "BPF-based Qdisc"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ help
+ This option allows BPF-based queueing disiplines. With BPF struct_ops,
+ users can implement supported operators in Qdisc_ops using BPF programs.
+ The queue holding skb can be built with BPF maps or graphs.
+
+ Say Y here if you want to use BPF-based Qdisc.
+
+ If unsure, say N.
+
+config NET_SCH_DUALPI2
+ tristate "Dual Queue PI Square (DUALPI2) scheduler"
+ help
+ Say Y here if you want to use the Dual Queue Proportional Integral
+ Controller Improved with a Square scheduling algorithm.
+ For more information, please see https://tools.ietf.org/html/rfc9332
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_dualpi2.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
- ---help---
+ help
Support for selection of default queuing discipline.
Nearly all users can safely say no here, and the default
@@ -438,6 +456,9 @@ choice
config DEFAULT_FQ_CODEL
bool "Fair Queue Controlled Delay" if NET_SCH_FQ_CODEL
+ config DEFAULT_FQ_PIE
+ bool "Flow Queue Proportional Integral controller Enhanced" if NET_SCH_FQ_PIE
+
config DEFAULT_SFQ
bool "Stochastic Fair Queue" if NET_SCH_SFQ
@@ -450,6 +471,7 @@ config DEFAULT_NET_SCH
default "pfifo_fast" if DEFAULT_PFIFO_FAST
default "fq" if DEFAULT_FQ
default "fq_codel" if DEFAULT_FQ_CODEL
+ default "fq_pie" if DEFAULT_FQ_PIE
default "sfq" if DEFAULT_SFQ
default "pfifo_fast"
endif
@@ -462,30 +484,19 @@ config NET_CLS
config NET_CLS_BASIC
tristate "Elementary classification (BASIC)"
select NET_CLS
- ---help---
+ help
Say Y here if you want to be able to classify packets using
only extended matches and actions.
To compile this code as a module, choose M here: the
module will be called cls_basic.
-config NET_CLS_TCINDEX
- tristate "Traffic-Control Index (TCINDEX)"
- select NET_CLS
- ---help---
- Say Y here if you want to be able to classify packets based on
- traffic control indices. You will want this feature if you want
- to implement Differentiated Services together with DSMARK.
-
- To compile this code as a module, choose M here: the
- module will be called cls_tcindex.
-
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
depends on INET
select IP_ROUTE_CLASSID
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets
according to the route table entry they matched.
@@ -495,7 +506,7 @@ config NET_CLS_ROUTE4
config NET_CLS_FW
tristate "Netfilter mark (FW)"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets
according to netfilter/firewall marks.
@@ -505,7 +516,7 @@ config NET_CLS_FW
config NET_CLS_U32
tristate "Universal 32bit comparisons w/ hashing (U32)"
select NET_CLS
- ---help---
+ help
Say Y here to be able to classify packets using a universal
32bit pieces based comparison scheme.
@@ -515,48 +526,20 @@ config NET_CLS_U32
config CLS_U32_PERF
bool "Performance counters support"
depends on NET_CLS_U32
- ---help---
+ help
Say Y here to make u32 gather additional statistics useful for
fine tuning u32 classifiers.
config CLS_U32_MARK
bool "Netfilter marks support"
depends on NET_CLS_U32
- ---help---
+ help
Say Y here to be able to use netfilter marks as u32 key.
-config NET_CLS_RSVP
- tristate "IPv4 Resource Reservation Protocol (RSVP)"
- select NET_CLS
- ---help---
- The Resource Reservation Protocol (RSVP) permits end systems to
- request a minimum and maximum data flow rate for a connection; this
- is important for real time data such as streaming sound or video.
-
- Say Y here if you want to be able to classify outgoing packets based
- on their RSVP requests.
-
- To compile this code as a module, choose M here: the
- module will be called cls_rsvp.
-
-config NET_CLS_RSVP6
- tristate "IPv6 Resource Reservation Protocol (RSVP6)"
- select NET_CLS
- ---help---
- The Resource Reservation Protocol (RSVP) permits end systems to
- request a minimum and maximum data flow rate for a connection; this
- is important for real time data such as streaming sound or video.
-
- Say Y here if you want to be able to classify outgoing packets based
- on their RSVP requests and you are using the IPv6 protocol.
-
- To compile this code as a module, choose M here: the
- module will be called cls_rsvp6.
-
config NET_CLS_FLOW
tristate "Flow classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
a configurable combination of packet keys. This is mostly useful
in combination with SFQ.
@@ -569,7 +552,7 @@ config NET_CLS_CGROUP
select NET_CLS
select CGROUP_NET_CLASSID
depends on CGROUPS
- ---help---
+ help
Say Y here if you want to classify packets based on the control
cgroup of their process.
@@ -579,7 +562,7 @@ config NET_CLS_CGROUP
config NET_CLS_BPF
tristate "BPF-based classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
programmable BPF (JIT'ed) filters as an alternative to ematches.
@@ -589,7 +572,7 @@ config NET_CLS_BPF
config NET_CLS_FLOWER
tristate "Flower classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
a configurable combination of packet keys and masks.
@@ -599,7 +582,7 @@ config NET_CLS_FLOWER
config NET_CLS_MATCHALL
tristate "Match-all classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
nothing. Every packet will match.
@@ -609,7 +592,7 @@ config NET_CLS_MATCHALL
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
- ---help---
+ help
Say Y here if you want to use extended matches on top of classifiers
and select the extended matches below.
@@ -623,7 +606,7 @@ config NET_EMATCH_STACK
int "Stack size"
depends on NET_EMATCH
default "32"
- ---help---
+ help
Size of the local stack variable used while evaluating the tree of
ematches. Limits the depth of the tree, i.e. the number of
encapsulated precedences. Every level requires 4 bytes of additional
@@ -632,7 +615,7 @@ config NET_EMATCH_STACK
config NET_EMATCH_CMP
tristate "Simple packet data comparison"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
simple packet data comparisons for 8, 16, and 32bit values.
@@ -642,7 +625,7 @@ config NET_EMATCH_CMP
config NET_EMATCH_NBYTE
tristate "Multi byte comparison"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
multiple byte comparisons mainly useful for IPv6 address comparisons.
@@ -652,7 +635,7 @@ config NET_EMATCH_NBYTE
config NET_EMATCH_U32
tristate "U32 key"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets using
the famous u32 key in combination with logic relations.
@@ -662,7 +645,7 @@ config NET_EMATCH_U32
config NET_EMATCH_META
tristate "Metadata"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
metadata such as load average, netfilter attributes, socket
attributes and routing decisions.
@@ -677,7 +660,7 @@ config NET_EMATCH_TEXT
select TEXTSEARCH_KMP
select TEXTSEARCH_BM
select TEXTSEARCH_FSM
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
textsearch comparisons.
@@ -687,7 +670,7 @@ config NET_EMATCH_TEXT
config NET_EMATCH_CANID
tristate "CAN Identifier"
depends on NET_EMATCH && (CAN=y || CAN=m)
- ---help---
+ help
Say Y here if you want to be able to classify CAN frames based
on CAN Identifier.
@@ -697,7 +680,7 @@ config NET_EMATCH_CANID
config NET_EMATCH_IPSET
tristate "IPset"
depends on NET_EMATCH && IP_SET
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
ipset membership.
@@ -707,7 +690,7 @@ config NET_EMATCH_IPSET
config NET_EMATCH_IPT
tristate "IPtables Matches"
depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES
- ---help---
+ help
Say Y here to be able to classify packets based on iptables
matches.
Current supported match is "policy" which allows packet classification
@@ -719,7 +702,8 @@ config NET_EMATCH_IPT
config NET_CLS_ACT
bool "Actions"
select NET_CLS
- ---help---
+ select NET_XGRESS
+ help
Say Y here if you want to use traffic control actions. Actions
get attached to classifiers and are invoked after a successful
classification. They are used to overwrite the classification
@@ -730,8 +714,8 @@ config NET_CLS_ACT
config NET_ACT_POLICE
tristate "Traffic Policing"
- depends on NET_CLS_ACT
- ---help---
+ depends on NET_CLS_ACT
+ help
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
module.
@@ -740,9 +724,9 @@ config NET_ACT_POLICE
module will be called act_police.
config NET_ACT_GACT
- tristate "Generic actions"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Generic actions"
+ depends on NET_CLS_ACT
+ help
Say Y here to take generic actions such as dropping and
accepting packets.
@@ -750,15 +734,15 @@ config NET_ACT_GACT
module will be called act_gact.
config GACT_PROB
- bool "Probability support"
- depends on NET_ACT_GACT
- ---help---
+ bool "Probability support"
+ depends on NET_ACT_GACT
+ help
Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
- tristate "Redirecting and Mirroring"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Redirecting and Mirroring"
+ depends on NET_CLS_ACT
+ help
Say Y here to allow packets to be mirrored or redirected to
other devices.
@@ -766,10 +750,10 @@ config NET_ACT_MIRRED
module will be called act_mirred.
config NET_ACT_SAMPLE
- tristate "Traffic Sampling"
- depends on NET_CLS_ACT
- select PSAMPLE
- ---help---
+ tristate "Traffic Sampling"
+ depends on NET_CLS_ACT
+ select PSAMPLE
+ help
Say Y here to allow packet sampling tc action. The packet sample
action consists of statistically choosing packets and sampling
them using the psample module.
@@ -777,20 +761,10 @@ config NET_ACT_SAMPLE
To compile this code as a module, choose M here: the
module will be called act_sample.
-config NET_ACT_IPT
- tristate "IPtables targets"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- ---help---
- Say Y here to be able to invoke iptables targets after successful
- classification.
-
- To compile this code as a module, choose M here: the
- module will be called act_ipt.
-
config NET_ACT_NAT
- tristate "Stateless NAT"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Stateless NAT"
+ depends on NET_CLS_ACT
+ help
Say Y here to do stateless NAT on IPv4 packets. You should use
netfilter for NAT unless you know what you are doing.
@@ -798,18 +772,18 @@ config NET_ACT_NAT
module will be called act_nat.
config NET_ACT_PEDIT
- tristate "Packet Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Packet Editing"
+ depends on NET_CLS_ACT
+ help
Say Y here if you want to mangle the content of packets.
To compile this code as a module, choose M here: the
module will be called act_pedit.
config NET_ACT_SIMP
- tristate "Simple Example (Debug)"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Simple Example (Debug)"
+ depends on NET_CLS_ACT
+ help
Say Y here to add a simple action for demonstration purposes.
It is meant as an example and for debugging purposes. It will
print a configured policy string followed by the packet count
@@ -821,9 +795,9 @@ config NET_ACT_SIMP
module will be called act_simple.
config NET_ACT_SKBEDIT
- tristate "SKB Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "SKB Editing"
+ depends on NET_CLS_ACT
+ help
Say Y here to change skb priority or queue_mapping settings.
If unsure, say N.
@@ -832,20 +806,31 @@ config NET_ACT_SKBEDIT
module will be called act_skbedit.
config NET_ACT_CSUM
- tristate "Checksum Updating"
- depends on NET_CLS_ACT && INET
- select LIBCRC32C
- ---help---
+ tristate "Checksum Updating"
+ depends on NET_CLS_ACT && INET
+ select NET_CRC32C
+ help
Say Y here to update some common checksum after some direct
packet alterations.
To compile this code as a module, choose M here: the
module will be called act_csum.
+config NET_ACT_MPLS
+ tristate "MPLS manipulation"
+ depends on NET_CLS_ACT
+ help
+ Say Y here to push or pop MPLS headers.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_mpls.
+
config NET_ACT_VLAN
- tristate "Vlan manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Vlan manipulation"
+ depends on NET_CLS_ACT
+ help
Say Y here to push or pop vlan headers.
If unsure, say N.
@@ -854,9 +839,9 @@ config NET_ACT_VLAN
module will be called act_vlan.
config NET_ACT_BPF
- tristate "BPF based action"
- depends on NET_CLS_ACT
- ---help---
+ tristate "BPF based action"
+ depends on NET_CLS_ACT
+ help
Say Y here to execute BPF code on packets. The BPF code will decide
if the packet should be dropped or not.
@@ -866,10 +851,10 @@ config NET_ACT_BPF
module will be called act_bpf.
config NET_ACT_CONNMARK
- tristate "Netfilter Connection Mark Retriever"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- depends on NF_CONNTRACK && NF_CONNTRACK_MARK
- ---help---
+ tristate "Netfilter Connection Mark Retriever"
+ depends on NET_CLS_ACT && NETFILTER
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
Say Y here to allow retrieving of conn mark
If unsure, say N.
@@ -877,22 +862,39 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_CTINFO
+ tristate "Netfilter Connection Mark Actions"
+ depends on NET_CLS_ACT && NETFILTER
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
+ Say Y here to allow transfer of a connmark stored information.
+ Current actions transfer connmark stored DSCP into
+ ipv4/v6 diffserv and/or to transfer connmark to packet
+ mark. Both are useful for restoring egress based marks
+ back onto ingress connections for qdisc priority mapping
+ purposes.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ctinfo.
+
config NET_ACT_SKBMOD
- tristate "skb data modification action"
- depends on NET_CLS_ACT
- ---help---
- Say Y here to allow modification of skb data
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+ help
+ Say Y here to allow modification of skb data
- If unsure, say N.
+ If unsure, say N.
- To compile this code as a module, choose M here: the
- module will be called act_skbmod.
+ To compile this code as a module, choose M here: the
+ module will be called act_skbmod.
config NET_ACT_IFE
- tristate "Inter-FE action based on IETF ForCES InterFE LFB"
- depends on NET_CLS_ACT
- select NET_IFE
- ---help---
+ tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+ depends on NET_CLS_ACT
+ select NET_IFE
+ help
Say Y here to allow for sourcing and terminating metadata
For details refer to netdev01 paper:
"Distributing Linux Traffic Control Classifier-Action Subsystem"
@@ -902,9 +904,9 @@ config NET_ACT_IFE
module will be called act_ife.
config NET_ACT_TUNNEL_KEY
- tristate "IP tunnel metadata manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "IP tunnel metadata manipulation"
+ depends on NET_CLS_ACT
+ help
Say Y here to set/release ip tunnel metadata.
If unsure, say N.
@@ -912,25 +914,54 @@ config NET_ACT_TUNNEL_KEY
To compile this code as a module, choose M here: the
module will be called act_tunnel_key.
+config NET_ACT_CT
+ tristate "connection tracking tc action"
+ depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE
+ select NF_CONNTRACK_OVS
+ select NF_NAT_OVS if NF_NAT
+ help
+ Say Y here to allow sending the packets to conntrack module.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ct.
+
+config NET_ACT_GATE
+ tristate "Frame gate entry list control tc action"
+ depends on NET_CLS_ACT
+ help
+ Say Y here to allow to control the ingress flow to be passed at
+ specific time slot and be dropped at other specific time slot by
+ the gate entry list.
+
+ If unsure, say N.
+ To compile this code as a module, choose M here: the
+ module will be called act_gate.
+
config NET_IFE_SKBMARK
- tristate "Support to encoding decoding skb mark on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb mark on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBPRIO
- tristate "Support to encoding decoding skb prio on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb prio on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBTCINDEX
- tristate "Support to encoding decoding skb tcindex on IFE action"
- depends on NET_ACT_IFE
-
-config NET_CLS_IND
- bool "Incoming device classification"
- depends on NET_CLS_U32 || NET_CLS_FW
- ---help---
- Say Y here to extend the u32 and fw classifier to support
- classification based on the incoming device. This option is
- likely to disappear in favour of the metadata ematch.
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
+
+config NET_TC_SKB_EXT
+ bool "TC recirculation support"
+ depends on NET_CLS_ACT
+ select SKB_EXTENSIONS
+
+ help
+ Say Y here to allow tc chain misses to continue in OvS datapath in
+ the correct recirc_id, and hardware chain misses to continue in
+ the correct chain in tc software datapath.
+
+ Say N here if you won't be using tc<->ovs offload or tc chains offload.
endif # NET_SCHED
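
The NET_SCH_FQ_PIE and NET_SCH_DUALPI2 entries added above, like the existing NET_SCH_PIE they build on, point at the PI-controller AQM family (RFC 8033, RFC 9332). As a rough illustration of what those schedulers compute, here is a minimal userspace C sketch of the RFC 8033 drop-probability update; the constant values are the RFC defaults, the alpha/beta auto-tuning and burst allowance are omitted, and none of this mirrors the kernel's fixed-point sch_pie.c code.

/* Minimal sketch of the PIE drop-probability update (RFC 8033).
 * Illustration only: simplified, floating point, no auto-tuning.
 */
#include <stdio.h>

#define PIE_TARGET_MS 15.0   /* reference queue delay */
#define PIE_ALPHA      0.125 /* weight on deviation from target */
#define PIE_BETA       1.25  /* weight on trend since last update */

static double pie_update(double prob, double qdelay_ms, double qdelay_old_ms)
{
	/* p += alpha * (qdelay - target) + beta * (qdelay - qdelay_old) */
	prob += PIE_ALPHA * (qdelay_ms - PIE_TARGET_MS) / 1000.0 +
		PIE_BETA * (qdelay_ms - qdelay_old_ms) / 1000.0;
	if (prob < 0.0)
		prob = 0.0;
	if (prob > 1.0)
		prob = 1.0;
	return prob;
}

int main(void)
{
	double delays_ms[] = { 5, 20, 40, 30, 10 }; /* sampled queue delays */
	double prob = 0.0, old_ms = 0.0;

	for (int i = 0; i < 5; i++) {
		prob = pie_update(prob, delays_ms[i], old_ms);
		old_ms = delays_ms[i];
		printf("qdelay=%2.0fms drop_prob=%.4f\n", delays_ms[i], prob);
	}
	return 0;
}
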
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..5078ea84e6ad 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -5,6 +5,7 @@
obj-y := sch_generic.o sch_mq.o
+obj-$(CONFIG_INET) += sch_frag.o
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
@@ -12,40 +13,42 @@ obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o
-obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+obj-$(CONFIG_NET_ACT_MPLS) += act_mpls.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o
obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
+obj-$(CONFIG_NET_ACT_CT) += act_ct.o
+obj-$(CONFIG_NET_ACT_GATE) += act_gate.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
-obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
-obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
-obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_MQPRIO_LIB) += sch_mqprio_lib.o
obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
@@ -55,16 +58,16 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
+obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
+obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o
+obj-$(CONFIG_NET_SCH_DUALPI2) += sch_dualpi2.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
-obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
-obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
-obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
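
The act_api.c diff below adds per-action skip_sw/skip_hw offload flags and a tc_act_flags_valid() helper whose XOR test is easy to misread: after masking everything except the two skip bits, XOR-ing against both bits set yields zero (invalid) only when both are requested at once. A small self-contained C sketch of that check follows; the bit values here are illustrative stand-ins, not the kernel's TCA_ACT_FLAGS_* uapi constants.

/* Stand-in flag values for illustration only. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_FLAGS_SKIP_HW 0x1u
#define DEMO_FLAGS_SKIP_SW 0x2u

static bool demo_act_flags_valid(unsigned int flags)
{
	flags &= DEMO_FLAGS_SKIP_HW | DEMO_FLAGS_SKIP_SW;

	/* zero (invalid) only when both skip bits are set */
	return flags ^ (DEMO_FLAGS_SKIP_HW | DEMO_FLAGS_SKIP_SW);
}

int main(void)
{
	unsigned int cases[] = { 0, DEMO_FLAGS_SKIP_HW, DEMO_FLAGS_SKIP_SW,
				 DEMO_FLAGS_SKIP_HW | DEMO_FLAGS_SKIP_SW };

	for (int i = 0; i < 4; i++)
		printf("flags=0x%x valid=%d\n", cases[i],
		       demo_act_flags_valid(cases[i]));
	return 0;
}
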
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d4b8355737d8..ff6be5cfe2b0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1,14 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_api.c Packet action API.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Author: Jamal Hadi Salim
- *
- *
*/
#include <linux/types.h>
@@ -25,30 +19,32 @@
#include <net/sock.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
+#include <net/tc_act/tc_pedit.h>
#include <net/act_api.h>
#include <net/netlink.h>
+#include <net/flow_offload.h>
+#include <net/tc_wrapper.h>
-static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
-{
- u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK;
-
- if (!tp)
- return -EINVAL;
- a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
- if (!a->goto_chain)
- return -ENOMEM;
- return 0;
-}
+#ifdef CONFIG_INET
+DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count);
+EXPORT_SYMBOL_GPL(tcf_frag_xmit_count);
+#endif
-static void tcf_action_goto_chain_fini(struct tc_action *a)
+int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
{
- tcf_chain_put_by_act(a->goto_chain);
+#ifdef CONFIG_INET
+ if (static_branch_unlikely(&tcf_frag_xmit_count))
+ return sch_frag_xmit_hook(skb, xmit);
+#endif
+
+ return xmit(skb);
}
+EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit);
static void tcf_action_goto_chain_exec(const struct tc_action *a,
struct tcf_result *res)
{
- const struct tcf_chain *chain = a->goto_chain;
+ const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain);
res->goto_tp = rcu_dereference_bh(chain->filter_chain);
}
@@ -66,11 +62,56 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
{
struct tc_cookie *old;
- old = xchg((__force struct tc_cookie **)old_cookie, new_cookie);
+ old = unrcu_pointer(xchg(old_cookie, RCU_INITIALIZER(new_cookie)));
if (old)
call_rcu(&old->rcu, tcf_free_cookie_rcu);
}
+int tcf_action_check_ctrlact(int action, struct tcf_proto *tp,
+ struct tcf_chain **newchain,
+ struct netlink_ext_ack *extack)
+{
+ int opcode = TC_ACT_EXT_OPCODE(action), ret = -EINVAL;
+ u32 chain_index;
+
+ if (!opcode)
+ ret = action > TC_ACT_VALUE_MAX ? -EINVAL : 0;
+ else if (opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC)
+ ret = 0;
+ if (ret) {
+ NL_SET_ERR_MSG(extack, "invalid control action");
+ goto end;
+ }
+
+ if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN)) {
+ chain_index = action & TC_ACT_EXT_VAL_MASK;
+ if (!tp || !newchain) {
+ ret = -EINVAL;
+ NL_SET_ERR_MSG(extack,
+ "can't goto NULL proto/chain");
+ goto end;
+ }
+ *newchain = tcf_chain_get_by_act(tp->chain->block, chain_index);
+ if (!*newchain) {
+ ret = -ENOMEM;
+ NL_SET_ERR_MSG(extack,
+ "can't allocate goto_chain");
+ }
+ }
+end:
+ return ret;
+}
+EXPORT_SYMBOL(tcf_action_check_ctrlact);
+
+struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
+ struct tcf_chain *goto_chain)
+{
+ a->tcfa_action = action;
+ goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1);
+ return goto_chain;
+}
+EXPORT_SYMBOL(tcf_action_set_ctrlact);
+
/* XXX: For standalone actions, we don't need a RCU grace period either, because
* actions are always connected to filters and filters are already destroyed in
* RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -78,19 +119,250 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
*/
static void free_tcf(struct tc_action *p)
{
+ struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1);
+
free_percpu(p->cpu_bstats);
free_percpu(p->cpu_bstats_hw);
free_percpu(p->cpu_qstats);
- tcf_set_action_cookie(&p->act_cookie, NULL);
- if (p->goto_chain)
- tcf_action_goto_chain_fini(p);
+ tcf_set_action_cookie(&p->user_cookie, NULL);
+ if (chain)
+ tcf_chain_put_by_act(chain);
kfree(p);
}
+static void offload_action_hw_count_set(struct tc_action *act,
+ u32 hw_count)
+{
+ act->in_hw_count = hw_count;
+}
+
+static void offload_action_hw_count_inc(struct tc_action *act,
+ u32 hw_count)
+{
+ act->in_hw_count += hw_count;
+}
+
+static void offload_action_hw_count_dec(struct tc_action *act,
+ u32 hw_count)
+{
+ act->in_hw_count = act->in_hw_count > hw_count ?
+ act->in_hw_count - hw_count : 0;
+}
+
+static unsigned int tcf_offload_act_num_actions_single(struct tc_action *act)
+{
+ if (is_tcf_pedit(act))
+ return tcf_pedit_nkeys(act);
+ else
+ return 1;
+}
+
+static bool tc_act_skip_hw(u32 flags)
+{
+ return (flags & TCA_ACT_FLAGS_SKIP_HW) ? true : false;
+}
+
+static bool tc_act_skip_sw(u32 flags)
+{
+ return (flags & TCA_ACT_FLAGS_SKIP_SW) ? true : false;
+}
+
+/* SKIP_HW and SKIP_SW are mutually exclusive flags. */
+static bool tc_act_flags_valid(u32 flags)
+{
+ flags &= TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW;
+
+ return flags ^ (TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW);
+}
+
+static int offload_action_init(struct flow_offload_action *fl_action,
+ struct tc_action *act,
+ enum offload_act_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ fl_action->extack = extack;
+ fl_action->command = cmd;
+ fl_action->index = act->tcfa_index;
+ fl_action->cookie = (unsigned long)act;
+
+ if (act->ops->offload_act_setup) {
+ spin_lock_bh(&act->tcfa_lock);
+ err = act->ops->offload_act_setup(act, fl_action, NULL,
+ false, extack);
+ spin_unlock_bh(&act->tcfa_lock);
+ return err;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int tcf_action_offload_cmd_ex(struct flow_offload_action *fl_act,
+ u32 *hw_count)
+{
+ int err;
+
+ err = flow_indr_dev_setup_offload(NULL, NULL, TC_SETUP_ACT,
+ fl_act, NULL, NULL);
+ if (err < 0)
+ return err;
+
+ if (hw_count)
+ *hw_count = err;
+
+ return 0;
+}
+
+static int tcf_action_offload_cmd_cb_ex(struct flow_offload_action *fl_act,
+ u32 *hw_count,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
+{
+ int err;
+
+ err = cb(NULL, NULL, cb_priv, TC_SETUP_ACT, NULL, fl_act, NULL);
+ if (err < 0)
+ return err;
+
+ if (hw_count)
+ *hw_count = 1;
+
+ return 0;
+}
+
+static int tcf_action_offload_cmd(struct flow_offload_action *fl_act,
+ u32 *hw_count,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
+{
+ return cb ? tcf_action_offload_cmd_cb_ex(fl_act, hw_count,
+ cb, cb_priv) :
+ tcf_action_offload_cmd_ex(fl_act, hw_count);
+}
+
+static int tcf_action_offload_add_ex(struct tc_action *action,
+ struct netlink_ext_ack *extack,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
+{
+ bool skip_sw = tc_act_skip_sw(action->tcfa_flags);
+ struct tc_action *actions[TCA_ACT_MAX_PRIO] = {
+ [0] = action,
+ };
+ struct flow_offload_action *fl_action;
+ u32 in_hw_count = 0;
+ int num, err = 0;
+
+ if (tc_act_skip_hw(action->tcfa_flags))
+ return 0;
+
+ num = tcf_offload_act_num_actions_single(action);
+ fl_action = offload_action_alloc(num);
+ if (!fl_action)
+ return -ENOMEM;
+
+ err = offload_action_init(fl_action, action, FLOW_ACT_REPLACE, extack);
+ if (err)
+ goto fl_err;
+
+ err = tc_setup_action(&fl_action->action, actions, 0, extack);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to setup tc actions for offload");
+ goto fl_err;
+ }
+
+ err = tcf_action_offload_cmd(fl_action, &in_hw_count, cb, cb_priv);
+ if (!err)
+ cb ? offload_action_hw_count_inc(action, in_hw_count) :
+ offload_action_hw_count_set(action, in_hw_count);
+
+ if (skip_sw && !tc_act_in_hw(action))
+ err = -EINVAL;
+
+ tc_cleanup_offload_action(&fl_action->action);
+
+fl_err:
+ kfree(fl_action);
+
+ return err;
+}
+
+/* offload the tc action after it is inserted */
+static int tcf_action_offload_add(struct tc_action *action,
+ struct netlink_ext_ack *extack)
+{
+ return tcf_action_offload_add_ex(action, extack, NULL, NULL);
+}
+
+int tcf_action_update_hw_stats(struct tc_action *action)
+{
+ struct flow_offload_action fl_act = {};
+ int err;
+
+ err = offload_action_init(&fl_act, action, FLOW_ACT_STATS, NULL);
+ if (err)
+ return err;
+
+ err = tcf_action_offload_cmd(&fl_act, NULL, NULL, NULL);
+ if (!err) {
+ preempt_disable();
+ tcf_action_stats_update(action, fl_act.stats.bytes,
+ fl_act.stats.pkts,
+ fl_act.stats.drops,
+ fl_act.stats.lastused,
+ true);
+ preempt_enable();
+ action->used_hw_stats = fl_act.stats.used_hw_stats;
+ action->used_hw_stats_valid = true;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tcf_action_update_hw_stats);
+
+static int tcf_action_offload_del_ex(struct tc_action *action,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
+{
+ struct flow_offload_action fl_act = {};
+ u32 in_hw_count = 0;
+ int err = 0;
+
+ if (!tc_act_in_hw(action))
+ return 0;
+
+ err = offload_action_init(&fl_act, action, FLOW_ACT_DESTROY, NULL);
+ if (err)
+ return err;
+
+ err = tcf_action_offload_cmd(&fl_act, &in_hw_count, cb, cb_priv);
+ if (err < 0)
+ return err;
+
+ if (!cb && action->in_hw_count != in_hw_count)
+ return -EINVAL;
+
+ /* do not need to update hw state when deleting action */
+ if (cb && in_hw_count)
+ offload_action_hw_count_dec(action, in_hw_count);
+
+ return 0;
+}
+
+static int tcf_action_offload_del(struct tc_action *action)
+{
+ return tcf_action_offload_del_ex(action, NULL, NULL);
+}
+
static void tcf_action_cleanup(struct tc_action *p)
{
+ tcf_action_offload_del(p);
if (p->ops->cleanup)
p->ops->cleanup(p);
@@ -118,7 +390,7 @@ static int __tcf_action_put(struct tc_action *p, bool bind)
return 0;
}
-int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
+static int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
{
int ret = 0;
@@ -144,29 +416,44 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
return ret;
}
-EXPORT_SYMBOL(__tcf_idr_release);
+
+int tcf_idr_release(struct tc_action *a, bool bind)
+{
+ const struct tc_action_ops *ops = a->ops;
+ int ret;
+
+ ret = __tcf_idr_release(a, bind, false);
+ if (ret == ACT_P_DELETED)
+ module_put(ops->owner);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_release);
static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
{
- struct tc_cookie *act_cookie;
+ struct tc_cookie *user_cookie;
u32 cookie_len = 0;
rcu_read_lock();
- act_cookie = rcu_dereference(act->act_cookie);
+ user_cookie = rcu_dereference(act->user_cookie);
- if (act_cookie)
- cookie_len = nla_total_size(act_cookie->len);
+ if (user_cookie)
+ cookie_len = nla_total_size(user_cookie->len);
rcu_read_unlock();
return nla_total_size(0) /* action number nested */
+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
+ cookie_len /* TCA_ACT_COOKIE */
+ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */
+ nla_total_size(0) /* TCA_ACT_STATS nested */
+ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */
/* TCA_STATS_BASIC */
+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+ /* TCA_STATS_PKT64 */
+ + nla_total_size_64bit(sizeof(u64))
/* TCA_STATS_QUEUE */
+ nla_total_size_64bit(sizeof(struct gnet_stats_queue))
- + nla_total_size(0) /* TCA_OPTIONS nested */
+ + nla_total_size(0) /* TCA_ACT_OPTIONS nested */
+ nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */
}
@@ -187,6 +474,80 @@ static size_t tcf_action_fill_size(const struct tc_action *act)
return sz;
}
+static int
+tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cookie *cookie;
+
+ if (nla_put_string(skb, TCA_ACT_KIND, a->ops->kind))
+ goto nla_put_failure;
+ if (tcf_action_copy_stats(skb, a, 0))
+ goto nla_put_failure;
+ if (from_act && nla_put_u32(skb, TCA_ACT_INDEX, a->tcfa_index))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ cookie = rcu_dereference(a->user_cookie);
+ if (cookie) {
+ if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int
+tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+ int err = -EINVAL;
+ u32 flags;
+
+ if (tcf_action_dump_terse(skb, a, false))
+ goto nla_put_failure;
+
+ if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
+ nla_put_bitfield32(skb, TCA_ACT_HW_STATS,
+ a->hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
+
+ if (a->used_hw_stats_valid &&
+ nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS,
+ a->used_hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
+
+ flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK;
+ if (flags &&
+ nla_put_bitfield32(skb, TCA_ACT_FLAGS,
+ flags, flags))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count))
+ goto nla_put_failure;
+
+ nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ err = tcf_action_dump_old(skb, a, bind, ref);
+ if (err > 0) {
+ nla_nest_end(skb, nest);
+ return err;
+ }
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct netlink_callback *cb)
{
@@ -197,27 +558,34 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
mutex_lock(&idrinfo->lock);
s_i = cb->args[0];
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
index++;
if (index < s_i)
continue;
+ if (IS_ERR(p))
+ continue;
if (jiffy_since &&
time_after(jiffy_since,
(unsigned long)p->tcfa_tm.lastuse))
continue;
- nest = nla_nest_start(skb, n_i);
+ tcf_action_update_hw_stats(p);
+
+ nest = nla_nest_start_noflag(skb, n_i);
if (!nest) {
index--;
goto nla_put_failure;
}
- err = tcf_action_dump_1(skb, p, 0, 0);
+ err = (act_flags & TCA_ACT_FLAG_TERSE_DUMP) ?
+ tcf_action_dump_terse(skb, p, true) :
+ tcf_action_dump_1(skb, p, 0, 0);
if (err < 0) {
index--;
nlmsg_trim(skb, nest);
@@ -225,7 +593,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
}
nla_nest_end(skb, nest);
n_i++;
- if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
+ if (!(act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) &&
n_i >= TCA_ACT_MAX_PRIO)
goto done;
}
@@ -235,7 +603,7 @@ done:
mutex_unlock(&idrinfo->lock);
if (n_i) {
- if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
+ if (act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON)
cb->args[1] = n_i;
}
return n_i;
@@ -260,7 +628,8 @@ static int tcf_idr_release_unsafe(struct tc_action *p)
}
static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
- const struct tc_action_ops *ops)
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
{
struct nlattr *nest;
int n_i = 0;
@@ -268,27 +637,36 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
- nest = nla_nest_start(skb, 0);
+ nest = nla_nest_start_noflag(skb, 0);
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_string(skb, TCA_KIND, ops->kind))
+ if (nla_put_string(skb, TCA_ACT_KIND, ops->kind))
goto nla_put_failure;
+ ret = 0;
mutex_lock(&idrinfo->lock);
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
+ if (IS_ERR(p))
+ continue;
ret = tcf_idr_release_unsafe(p);
- if (ret == ACT_P_DELETED) {
+ if (ret == ACT_P_DELETED)
module_put(ops->owner);
- n_i++;
- } else if (ret < 0) {
- mutex_unlock(&idrinfo->lock);
- goto nla_put_failure;
- }
+ else if (ret < 0)
+ break;
+ n_i++;
}
mutex_unlock(&idrinfo->lock);
+ if (ret < 0) {
+ if (n_i)
+ NL_SET_ERR_MSG(extack, "Unable to flush all TC actions");
+ else
+ goto nla_put_failure;
+ }
- if (nla_put_u32(skb, TCA_FCNT, n_i))
+ ret = nla_put_u32(skb, TCA_FCNT, n_i);
+ if (ret)
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -306,7 +684,7 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
struct tcf_idrinfo *idrinfo = tn->idrinfo;
if (type == RTM_DELACTION) {
- return tcf_del_walker(idrinfo, skb, ops);
+ return tcf_del_walker(idrinfo, skb, ops, extack);
} else if (type == RTM_GETACTION) {
return tcf_dump_walker(idrinfo, skb, cb);
} else {
@@ -338,6 +716,31 @@ int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
}
EXPORT_SYMBOL(tcf_idr_search);
+static int __tcf_generic_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ops->net_id);
+
+ if (unlikely(ops->walk))
+ return ops->walk(net, skb, cb, type, ops, extack);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int __tcf_idr_search(struct net *net,
+ const struct tc_action_ops *ops,
+ struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ops->net_id);
+
+ if (unlikely(ops->lookup))
+ return ops->lookup(net, a, index);
+
+ return tcf_idr_search(tn, a, index);
+}
+
static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
{
struct tc_action *p;
@@ -373,7 +776,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
- int bind, bool cpustats)
+ int bind, bool cpustats, u32 flags)
{
struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -386,30 +789,34 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
atomic_set(&p->tcfa_bindcnt, 1);
if (cpustats) {
- p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
if (!p->cpu_bstats)
goto err1;
- p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
if (!p->cpu_bstats_hw)
goto err2;
p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
if (!p->cpu_qstats)
goto err3;
}
+ gnet_stats_basic_sync_init(&p->tcfa_bstats);
+ gnet_stats_basic_sync_init(&p->tcfa_bstats_hw);
spin_lock_init(&p->tcfa_lock);
p->tcfa_index = index;
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
p->tcfa_tm.firstuse = 0;
+ p->tcfa_flags = flags;
if (est) {
err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
&p->tcfa_rate_est,
- &p->tcfa_lock, NULL, est);
+ &p->tcfa_lock, false, est);
if (err)
goto err4;
}
p->idrinfo = idrinfo;
+ __module_get(ops->owner);
p->ops = ops;
*a = p;
return 0;
@@ -425,16 +832,16 @@ err1:
}
EXPORT_SYMBOL(tcf_idr_create);
-void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
+int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
+ struct nlattr *est, struct tc_action **a,
+ const struct tc_action_ops *ops, int bind,
+ u32 flags)
{
- struct tcf_idrinfo *idrinfo = tn->idrinfo;
-
- mutex_lock(&idrinfo->lock);
- /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
- WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
- mutex_unlock(&idrinfo->lock);
+ /* Set cpustats according to actions flags. */
+ return tcf_idr_create(tn, index, est, a, ops, bind,
+ !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags);
}
-EXPORT_SYMBOL(tcf_idr_insert);
+EXPORT_SYMBOL(tcf_idr_create_from_flags);
/* Cleanup idr index that was allocated but not initialized. */
@@ -453,6 +860,9 @@ EXPORT_SYMBOL(tcf_idr_cleanup);
* its reference and bind counters, and return 1. Otherwise insert temporary
* error pointer (to prevent concurrent users from inserting actions with same
* index) and return 0.
+ *
+ * May return -EAGAIN for binding actions in case of a parallel add/delete on
+ * the requested index.
*/
int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
@@ -461,43 +871,60 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
struct tcf_idrinfo *idrinfo = tn->idrinfo;
struct tc_action *p;
int ret;
+ u32 max;
-again:
- mutex_lock(&idrinfo->lock);
if (*index) {
+ rcu_read_lock();
p = idr_find(&idrinfo->action_idr, *index);
+
if (IS_ERR(p)) {
/* This means that another process allocated
* index but did not assign the pointer yet.
*/
- mutex_unlock(&idrinfo->lock);
- goto again;
+ rcu_read_unlock();
+ return -EAGAIN;
}
- if (p) {
- refcount_inc(&p->tcfa_refcnt);
- if (bind)
- atomic_inc(&p->tcfa_bindcnt);
- *a = p;
- ret = 1;
- } else {
- *a = NULL;
- ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
- *index, GFP_KERNEL);
- if (!ret)
- idr_replace(&idrinfo->action_idr,
- ERR_PTR(-EBUSY), *index);
+ if (!p) {
+ /* Empty slot, try to allocate it */
+ max = *index;
+ rcu_read_unlock();
+ goto new;
}
+
+ if (!refcount_inc_not_zero(&p->tcfa_refcnt)) {
+ /* Action was deleted in parallel */
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+
+ if (bind)
+ atomic_inc(&p->tcfa_bindcnt);
+ *a = p;
+
+ rcu_read_unlock();
+
+ return 1;
} else {
+ /* Find a slot */
*index = 1;
- *a = NULL;
- ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
- UINT_MAX, GFP_KERNEL);
- if (!ret)
- idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
- *index);
+ max = UINT_MAX;
}
+
+new:
+ *a = NULL;
+
+ mutex_lock(&idrinfo->lock);
+ ret = idr_alloc_u32(&idrinfo->action_idr, ERR_PTR(-EBUSY), index, max,
+ GFP_KERNEL);
mutex_unlock(&idrinfo->lock);
+
+ /* N binds raced for action allocation,
+ * retry for all the ones that failed.
+ */
+ if (ret == -ENOSPC && *index == max)
+ ret = -EAGAIN;
+
return ret;
}
EXPORT_SYMBOL(tcf_idr_check_alloc);
@@ -506,23 +933,84 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
struct tcf_idrinfo *idrinfo)
{
struct idr *idr = &idrinfo->action_idr;
+ bool mutex_taken = false;
struct tc_action *p;
- int ret;
unsigned long id = 1;
+ unsigned long tmp;
+ int ret;
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
+ if (tc_act_in_hw(p) && !mutex_taken) {
+ rtnl_lock();
+ mutex_taken = true;
+ }
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
else if (ret < 0)
return;
}
+ if (mutex_taken)
+ rtnl_unlock();
idr_destroy(&idrinfo->action_idr);
}
EXPORT_SYMBOL(tcf_idrinfo_destroy);
static LIST_HEAD(act_base);
static DEFINE_RWLOCK(act_mod_lock);
+/* since act ops id is stored in pernet subsystem list,
+ * then there is no way to walk through only all the action
+ * subsystem, so we keep tc action pernet ops id for
+ * reoffload to walk through.
+ */
+static LIST_HEAD(act_pernet_id_list);
+static DEFINE_MUTEX(act_id_mutex);
+struct tc_act_pernet_id {
+ struct list_head list;
+ unsigned int id;
+};
+
+static int tcf_pernet_add_id_list(unsigned int id)
+{
+ struct tc_act_pernet_id *id_ptr;
+ int ret = 0;
+
+ mutex_lock(&act_id_mutex);
+ list_for_each_entry(id_ptr, &act_pernet_id_list, list) {
+ if (id_ptr->id == id) {
+ ret = -EEXIST;
+ goto err_out;
+ }
+ }
+
+ id_ptr = kzalloc(sizeof(*id_ptr), GFP_KERNEL);
+ if (!id_ptr) {
+ ret = -ENOMEM;
+ goto err_out;
+ }
+ id_ptr->id = id;
+
+ list_add_tail(&id_ptr->list, &act_pernet_id_list);
+
+err_out:
+ mutex_unlock(&act_id_mutex);
+ return ret;
+}
+
+static void tcf_pernet_del_id_list(unsigned int id)
+{
+ struct tc_act_pernet_id *id_ptr;
+
+ mutex_lock(&act_id_mutex);
+ list_for_each_entry(id_ptr, &act_pernet_id_list, list) {
+ if (id_ptr->id == id) {
+ list_del(&id_ptr->list);
+ kfree(id_ptr);
+ break;
+ }
+ }
+ mutex_unlock(&act_id_mutex);
+}
int tcf_register_action(struct tc_action_ops *act,
struct pernet_operations *ops)
@@ -530,7 +1018,7 @@ int tcf_register_action(struct tc_action_ops *act,
struct tc_action_ops *a;
int ret;
- if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
+ if (!act->act || !act->dump || !act->init)
return -EINVAL;
/* We have to register pernet ops before making the action ops visible,
@@ -541,18 +1029,31 @@ int tcf_register_action(struct tc_action_ops *act,
if (ret)
return ret;
+ if (ops->id) {
+ ret = tcf_pernet_add_id_list(*ops->id);
+ if (ret)
+ goto err_id;
+ }
+
write_lock(&act_mod_lock);
list_for_each_entry(a, &act_base, head) {
- if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
- write_unlock(&act_mod_lock);
- unregister_pernet_subsys(ops);
- return -EEXIST;
+ if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) {
+ ret = -EEXIST;
+ goto err_out;
}
}
list_add_tail(&act->head, &act_base);
write_unlock(&act_mod_lock);
return 0;
+
+err_out:
+ write_unlock(&act_mod_lock);
+ if (ops->id)
+ tcf_pernet_del_id_list(*ops->id);
+err_id:
+ unregister_pernet_subsys(ops);
+ return ret;
}
EXPORT_SYMBOL(tcf_register_action);
@@ -571,8 +1072,11 @@ int tcf_unregister_action(struct tc_action_ops *act,
}
}
write_unlock(&act_mod_lock);
- if (!err)
+ if (!err) {
unregister_pernet_subsys(ops);
+ if (ops->id)
+ tcf_pernet_del_id_list(*ops->id);
+ }
return err;
}
EXPORT_SYMBOL(tcf_unregister_action);
@@ -615,7 +1119,7 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
return res;
}
-/*TCA_ACT_MAX_PRIO is 32, there count upto 32 */
+/* TCA_ACT_MAX_PRIO is 32, so we count up to 32 here */
#define TCA_ACT_MAX_PRIO_MASK 0x1FF
int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
int nr_actions, struct tcf_result *res)
@@ -631,16 +1135,26 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
restart_act_graph:
for (i = 0; i < nr_actions; i++) {
const struct tc_action *a = actions[i];
+ int repeat_ttl;
if (jmp_prgcnt > 0) {
jmp_prgcnt -= 1;
continue;
}
-repeat:
- ret = a->ops->act(skb, a, res);
- if (ret == TC_ACT_REPEAT)
- goto repeat; /* we need a ttl - JHS */
+ if (tc_act_skip_sw(a->tcfa_flags))
+ continue;
+
+ repeat_ttl = 32;
+repeat:
+ ret = tc_act(skb, a, res);
+ if (unlikely(ret == TC_ACT_REPEAT)) {
+ if (--repeat_ttl != 0)
+ goto repeat;
+ /* suspicious opcode, stop pipeline */
+ net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n");
+ return TC_ACT_OK;
+ }
if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) {
jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK;
if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) {
@@ -654,6 +1168,11 @@ repeat:
return TC_ACT_OK;
}
} else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) {
+ if (unlikely(!rcu_access_pointer(a->goto_chain))) {
+ tcf_set_drop_reason(skb,
+ SKB_DROP_REASON_TC_CHAIN_NOTFOUND);
+ return TC_ACT_SHOT;
+ }
tcf_action_goto_chain_exec(a, res);
}
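The tcf_action_exec() change above bounds TC_ACT_REPEAT with a ttl of 32 so a misbehaving action cannot re-run itself forever, and the goto_chain hunk drops packets whose target chain is gone. The repeat guard is just a counted retry; a standalone sketch of the shape of that loop (the toy action callback and return codes are stand-ins, not the kernel ones):

#include <stdio.h>

enum { ACT_OK = 0, ACT_REPEAT = 1 };

/* Toy action that asks to be re-run a few times before succeeding. */
static int toy_act(int *state)
{
    return (*state)-- > 0 ? ACT_REPEAT : ACT_OK;
}

int main(void)
{
    int state = 5;
    int repeat_ttl = 32;           /* same bound the patch uses */
    int ret;

repeat:
    ret = toy_act(&state);
    if (ret == ACT_REPEAT) {
        if (--repeat_ttl != 0)
            goto repeat;
        fprintf(stderr, "repeat budget exhausted, stopping\n");
        return 1;
    }
    printf("action finished after %d repeats\n", 32 - repeat_ttl);
    return 0;
}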
@@ -671,8 +1190,7 @@ int tcf_action_destroy(struct tc_action *actions[], int bind)
struct tc_action *a;
int ret = 0, i;
- for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
- a = actions[i];
+ tcf_act_for_each_action(i, a, actions) {
actions[i] = NULL;
ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
@@ -684,92 +1202,58 @@ int tcf_action_destroy(struct tc_action *actions[], int bind)
return ret;
}
-static int tcf_action_destroy_1(struct tc_action *a, int bind)
-{
- struct tc_action *actions[] = { a, NULL };
-
- return tcf_action_destroy(actions, bind);
-}
-
static int tcf_action_put(struct tc_action *p)
{
return __tcf_action_put(p, false);
}
-/* Put all actions in this array, skip those NULL's. */
static void tcf_action_put_many(struct tc_action *actions[])
{
+ struct tc_action *a;
int i;
- for (i = 0; i < TCA_ACT_MAX_PRIO; i++) {
- struct tc_action *a = actions[i];
- const struct tc_action_ops *ops;
-
- if (!a)
- continue;
- ops = a->ops;
+ tcf_act_for_each_action(i, a, actions) {
+ const struct tc_action_ops *ops = a->ops;
if (tcf_action_put(a))
module_put(ops->owner);
}
}
-int
-tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static void tca_put_bound_many(struct tc_action *actions[], int init_res[])
{
- return a->ops->dump(skb, a, bind, ref);
-}
-
-int
-tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
-{
- int err = -EINVAL;
- unsigned char *b = skb_tail_pointer(skb);
- struct nlattr *nest;
- struct tc_cookie *cookie;
+ struct tc_action *a;
+ int i;
- if (nla_put_string(skb, TCA_KIND, a->ops->kind))
- goto nla_put_failure;
- if (tcf_action_copy_stats(skb, a, 0))
- goto nla_put_failure;
+ tcf_act_for_each_action(i, a, actions) {
+ const struct tc_action_ops *ops = a->ops;
- rcu_read_lock();
- cookie = rcu_dereference(a->act_cookie);
- if (cookie) {
- if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
- rcu_read_unlock();
- goto nla_put_failure;
- }
- }
- rcu_read_unlock();
+ if (init_res[i] == ACT_P_CREATED)
+ continue;
- nest = nla_nest_start(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
- err = tcf_action_dump_old(skb, a, bind, ref);
- if (err > 0) {
- nla_nest_end(skb, nest);
- return err;
+ if (tcf_action_put(a))
+ module_put(ops->owner);
}
+}
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
+int
+tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ return a->ops->dump(skb, a, bind, ref);
}
-EXPORT_SYMBOL(tcf_action_dump_1);
int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
- int bind, int ref)
+ int bind, int ref, bool terse)
{
struct tc_action *a;
int err = -EINVAL, i;
struct nlattr *nest;
- for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
- a = actions[i];
- nest = nla_nest_start(skb, a->order);
+ tcf_act_for_each_action(i, a, actions) {
+ nest = nla_nest_start_noflag(skb, i + 1);
if (nest == NULL)
goto nla_put_failure;
- err = tcf_action_dump_1(skb, a, bind, ref);
+ err = terse ? tcf_action_dump_terse(skb, a, false) :
+ tcf_action_dump_1(skb, a, bind, ref);
if (err < 0)
goto errout;
nla_nest_end(skb, nest);
@@ -800,72 +1284,91 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
return c;
}
-static bool tcf_action_valid(int action)
+static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr)
{
- int opcode = TC_ACT_EXT_OPCODE(action);
+ struct nla_bitfield32 hw_stats_bf;
- if (!opcode)
- return action <= TC_ACT_VALUE_MAX;
- return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
+ /* If the user did not pass the attr, they do not care about the
+ * type. Return "any" in that case, which enables all supported
+ * types.
+ */
+ if (!hw_stats_attr)
+ return TCA_ACT_HW_STATS_ANY;
+ hw_stats_bf = nla_get_bitfield32(hw_stats_attr);
+ return hw_stats_bf.value;
}
-struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
- struct nlattr *nla, struct nlattr *est,
- char *name, int ovr, int bind,
- bool rtnl_held,
- struct netlink_ext_ack *extack)
+static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
+ [TCA_ACT_KIND] = { .type = NLA_STRING },
+ [TCA_ACT_INDEX] = { .type = NLA_U32 },
+ [TCA_ACT_COOKIE] = { .type = NLA_BINARY,
+ .len = TC_COOKIE_MAX_SIZE },
+ [TCA_ACT_OPTIONS] = { .type = NLA_NESTED },
+ [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS |
+ TCA_ACT_FLAGS_SKIP_HW |
+ TCA_ACT_FLAGS_SKIP_SW),
+ [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY),
+};
+
+void tcf_idr_insert_many(struct tc_action *actions[], int init_res[])
{
struct tc_action *a;
+ int i;
+
+ tcf_act_for_each_action(i, a, actions) {
+ struct tcf_idrinfo *idrinfo;
+
+ if (init_res[i] == ACT_P_BOUND)
+ continue;
+
+ idrinfo = a->idrinfo;
+ mutex_lock(&idrinfo->lock);
+ /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+ idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
+ mutex_unlock(&idrinfo->lock);
+ }
+}
+
+struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags,
+ struct netlink_ext_ack *extack)
+{
+ bool police = flags & TCA_ACT_FLAGS_POLICE;
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct tc_action_ops *a_o;
- struct tc_cookie *cookie = NULL;
char act_name[IFNAMSIZ];
- struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
int err;
- if (name == NULL) {
- err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ if (!police) {
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
- goto err_out;
+ return ERR_PTR(err);
err = -EINVAL;
kind = tb[TCA_ACT_KIND];
if (!kind) {
NL_SET_ERR_MSG(extack, "TC action kind must be specified");
- goto err_out;
+ return ERR_PTR(err);
}
- if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) {
+ if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) {
NL_SET_ERR_MSG(extack, "TC action name too long");
- goto err_out;
- }
- if (tb[TCA_ACT_COOKIE]) {
- int cklen = nla_len(tb[TCA_ACT_COOKIE]);
-
- if (cklen > TC_COOKIE_MAX_SIZE) {
- NL_SET_ERR_MSG(extack, "TC cookie size above the maximum");
- goto err_out;
- }
-
- cookie = nla_memdup_cookie(tb);
- if (!cookie) {
- NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
- err = -ENOMEM;
- goto err_out;
- }
+ return ERR_PTR(err);
}
} else {
- if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
+ if (strscpy(act_name, "police", IFNAMSIZ) < 0) {
NL_SET_ERR_MSG(extack, "TC action name too long");
- err = -EINVAL;
- goto err_out;
+ return ERR_PTR(-EINVAL);
}
}
a_o = tc_lookup_action_n(act_name);
if (a_o == NULL) {
#ifdef CONFIG_MODULES
+ bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL);
+
if (rtnl_held)
rtnl_unlock();
- request_module("act_%s", act_name);
+ request_module(NET_ACT_ALIAS_PREFIX "%s", act_name);
if (rtnl_held)
rtnl_lock();
@@ -878,105 +1381,222 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
* indicate this using -EAGAIN.
*/
if (a_o != NULL) {
- err = -EAGAIN;
- goto err_mod;
+ module_put(a_o->owner);
+ return ERR_PTR(-EAGAIN);
}
#endif
NL_SET_ERR_MSG(extack, "Failed to load TC action module");
- err = -ENOENT;
- goto err_out;
+ return ERR_PTR(-ENOENT);
}
- /* backward compatibility for policer */
- if (name == NULL)
- err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- rtnl_held, extack);
- else
- err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
- extack);
- if (err < 0)
- goto err_mod;
+ return a_o;
+}
- if (!name && tb[TCA_ACT_COOKIE])
- tcf_set_action_cookie(&a->act_cookie, cookie);
+struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
+ struct nlattr *nla, struct nlattr *est,
+ struct tc_action_ops *a_o, int *init_res,
+ u32 flags, struct netlink_ext_ack *extack)
+{
+ bool police = flags & TCA_ACT_FLAGS_POLICE;
+ struct nla_bitfield32 userflags = { 0, 0 };
+ struct tc_cookie *user_cookie = NULL;
+ u8 hw_stats = TCA_ACT_HW_STATS_ANY;
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ struct tc_action *a;
+ int err;
- /* module count goes up only when brand new policy is created
- * if it exists and is only bound to in a_o->init() then
- * ACT_P_CREATED is not returned (a zero is).
- */
- if (err != ACT_P_CREATED)
- module_put(a_o->owner);
-
- if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
- err = tcf_action_goto_chain_init(a, tp);
- if (err) {
- tcf_action_destroy_1(a, bind);
- NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
+ /* backward compatibility for policer */
+ if (!police) {
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
+ if (err < 0)
return ERR_PTR(err);
+ if (tb[TCA_ACT_COOKIE]) {
+ user_cookie = nla_memdup_cookie(tb);
+ if (!user_cookie) {
+ NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
+ err = -ENOMEM;
+ goto err_out;
+ }
+ }
+ hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]);
+ if (tb[TCA_ACT_FLAGS]) {
+ userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
+ if (!tc_act_flags_valid(userflags.value)) {
+ err = -EINVAL;
+ goto err_out;
+ }
}
- }
- if (!tcf_action_valid(a->tcfa_action)) {
- tcf_action_destroy_1(a, bind);
- NL_SET_ERR_MSG(extack, "Invalid control action value");
- return ERR_PTR(-EINVAL);
+ err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp,
+ userflags.value | flags, extack);
+ } else {
+ err = a_o->init(net, nla, est, &a, tp, userflags.value | flags,
+ extack);
}
+ if (err < 0)
+ goto err_out;
+ *init_res = err;
+
+ if (!police && tb[TCA_ACT_COOKIE])
+ tcf_set_action_cookie(&a->user_cookie, user_cookie);
+
+ if (!police)
+ a->hw_stats = hw_stats;
return a;
-err_mod:
- module_put(a_o->owner);
err_out:
- if (cookie) {
- kfree(cookie->data);
- kfree(cookie);
+ if (user_cookie) {
+ kfree(user_cookie->data);
+ kfree(user_cookie);
}
return ERR_PTR(err);
}
+static bool tc_act_bind(u32 flags)
+{
+ return !!(flags & TCA_ACT_FLAGS_BIND);
+}
+
/* Returns numbers of initialized actions or negative error. */
int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
- struct nlattr *est, char *name, int ovr, int bind,
- struct tc_action *actions[], size_t *attr_size,
- bool rtnl_held, struct netlink_ext_ack *extack)
+ struct nlattr *est, struct tc_action *actions[],
+ int init_res[], size_t *attr_size,
+ u32 flags, u32 fl_flags,
+ struct netlink_ext_ack *extack)
{
- struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {};
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 2];
struct tc_action *act;
size_t sz = 0;
int err;
int i;
- err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL,
+ extack);
if (err < 0)
return err;
+ /* The nested attributes are parsed as types, but they are really an
+ * array of actions. So we parse one more than we can handle, and return
+ * an error if the last one is set (as that indicates that the request
+ * contained more than the maximum number of actions).
+ */
+ if (tb[TCA_ACT_MAX_PRIO + 1]) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Only %d actions supported per filter",
+ TCA_ACT_MAX_PRIO);
+ return -EINVAL;
+ }
+
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
- act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
- rtnl_held, extack);
+ struct tc_action_ops *a_o;
+
+ a_o = tc_action_load_ops(tb[i], flags, extack);
+ if (IS_ERR(a_o)) {
+ err = PTR_ERR(a_o);
+ goto err_mod;
+ }
+ ops[i - 1] = a_o;
+ }
+
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+ act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1],
+ &init_res[i - 1], flags, extack);
if (IS_ERR(act)) {
err = PTR_ERR(act);
goto err;
}
- act->order = i;
sz += tcf_action_fill_size(act);
/* Start from index 0 */
actions[i - 1] = act;
+ if (tc_act_bind(flags)) {
+ bool skip_sw = tc_skip_sw(fl_flags);
+ bool skip_hw = tc_skip_hw(fl_flags);
+
+ if (tc_act_bind(act->tcfa_flags)) {
+ /* Action is created by classifier and is not
+ * standalone. Check that the user did not set
+ * any action flags different than the
+ * classifier flags, and inherit the flags from
+ * the classifier for the compatibility case
+ * where no flags were specified at all.
+ */
+ if ((tc_act_skip_sw(act->tcfa_flags) && !skip_sw) ||
+ (tc_act_skip_hw(act->tcfa_flags) && !skip_hw)) {
+ NL_SET_ERR_MSG(extack,
+ "Mismatch between action and filter offload flags");
+ err = -EINVAL;
+ goto err;
+ }
+ if (skip_sw)
+ act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_SW;
+ if (skip_hw)
+ act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_HW;
+ continue;
+ }
+
+ /* Action is standalone */
+ if (skip_sw != tc_act_skip_sw(act->tcfa_flags) ||
+ skip_hw != tc_act_skip_hw(act->tcfa_flags)) {
+ NL_SET_ERR_MSG(extack,
+ "Mismatch between action and filter offload flags");
+ err = -EINVAL;
+ goto err;
+ }
+ } else {
+ err = tcf_action_offload_add(act, extack);
+ if (tc_act_skip_sw(act->tcfa_flags) && err)
+ goto err;
+ }
}
+ /* We have to commit them all together, because if any error happened in
+ * between, we could not handle the failure gracefully.
+ */
+ tcf_idr_insert_many(actions, init_res);
+
*attr_size = tcf_action_full_attrs_size(sz);
- return i - 1;
+ err = i - 1;
+ goto err_mod;
err:
- tcf_action_destroy(actions, bind);
+ tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND);
+err_mod:
+ for (i = 0; i < TCA_ACT_MAX_PRIO && ops[i]; i++)
+ module_put(ops[i]->owner);
return err;
}
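tcf_action_init() above parses TCA_ACT_MAX_PRIO + 1 nested attributes and rejects the request when the extra slot is populated, which is how it detects "more actions than we can handle" without a second walk over the message; it then loads all ops first, initializes each action, and only commits them together at the end. A hypothetical standalone illustration of the extra-slot overflow trick on a plain array (the limit and parser below are made up for the example):

#include <stdio.h>

#define MAX_ITEMS 4

/* Copy up to max_plus_one items into out[]; the caller inspects the
 * extra slot to detect an oversized request. */
static void parse_items(const int *in, int in_len, int *out, int max_plus_one)
{
    int n = in_len < max_plus_one ? in_len : max_plus_one;

    for (int i = 0; i < n; i++)
        out[i] = in[i];
}

int main(void)
{
    const int request[] = { 10, 20, 30, 40, 50 };   /* one item too many */
    int slots[MAX_ITEMS + 1] = { 0 };

    parse_items(request, 5, slots, MAX_ITEMS + 1);

    /* In this toy 0 means "slot empty"; the kernel checks tb[MAX + 1]. */
    if (slots[MAX_ITEMS]) {
        fprintf(stderr, "only %d items supported per request\n", MAX_ITEMS);
        return 1;
    }
    printf("request accepted\n");
    return 0;
}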
+void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, bool hw)
+{
+ if (a->cpu_bstats) {
+ _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+ this_cpu_ptr(a->cpu_qstats)->drops += drops;
+
+ if (hw)
+ _bstats_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ return;
+ }
+
+ _bstats_update(&a->tcfa_bstats, bytes, packets);
+ atomic_add(drops, &a->tcfa_drops);
+ if (hw)
+ _bstats_update(&a->tcfa_bstats_hw, bytes, packets);
+}
+EXPORT_SYMBOL(tcf_action_update_stats);
+
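The new tcf_action_update_stats() helper picks between per-CPU counters (when cpu_bstats was allocated) and the shared fallback counters. The same split is easy to show in a userspace sketch, with an optional per-"CPU" array standing in for the percpu allocation (nothing below is a kernel API):

#include <stdio.h>

#define NCPUS 4

struct stats { unsigned long bytes, packets; };

struct action_stats {
    struct stats *percpu;          /* per-CPU counters, may be NULL */
    struct stats shared;           /* fallback shared counters */
};

/* Mirrors the shape of tcf_action_update_stats(): prefer the per-CPU
 * slot when it exists, otherwise fall back to the shared counters. */
static void stats_update(struct action_stats *s, int cpu,
                         unsigned long bytes, unsigned long packets)
{
    struct stats *st = s->percpu ? &s->percpu[cpu] : &s->shared;

    st->bytes += bytes;
    st->packets += packets;
}

int main(void)
{
    struct stats percpu[NCPUS] = { 0 };
    struct action_stats with = { .percpu = percpu };
    struct action_stats without = { .percpu = NULL };

    stats_update(&with, 1, 1500, 1);
    stats_update(&without, 0, 60, 1);

    printf("cpu1 bytes=%lu, shared bytes=%lu\n",
           with.percpu[1].bytes, without.shared.bytes);
    return 0;
}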
int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
int compat_mode)
{
- int err = 0;
+ struct gnet_stats_queue qstats = {0};
struct gnet_dump d;
+ int err = 0;
if (p == NULL)
goto errout;
@@ -1000,13 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
if (err < 0)
goto errout;
- if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
- gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw,
- &p->tcfa_bstats_hw) < 0 ||
+ qstats.drops = atomic_read(&p->tcfa_drops);
+ qstats.overlimits = atomic_read(&p->tcfa_overlimits);
+
+ if (gnet_stats_copy_basic(&d, p->cpu_bstats,
+ &p->tcfa_bstats, false) < 0 ||
+ gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw,
+ &p->tcfa_bstats_hw, false) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
- &p->tcfa_qstats,
- p->tcfa_qstats.qlen) < 0)
+ &qstats,
+ qstats.qlen) < 0)
goto errout;
if (gnet_stats_finish_copy(&d) < 0)
@@ -1020,7 +1644,7 @@ errout:
static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
u32 portid, u32 seq, u16 flags, int event, int bind,
- int ref)
+ int ref, struct netlink_ext_ack *extack)
{
struct tcamsg *t;
struct nlmsghdr *nlh;
@@ -1035,16 +1659,21 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
t->tca__pad1 = 0;
t->tca__pad2 = 0;
- nest = nla_nest_start(skb, TCA_ACT_TAB);
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_ROOT_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
+ nest = nla_nest_start_noflag(skb, TCA_ACT_TAB);
if (!nest)
goto out_nlmsg_trim;
- if (tcf_action_dump(skb, actions, bind, ref) < 0)
+ if (tcf_action_dump(skb, actions, bind, ref, false) < 0)
goto out_nlmsg_trim;
nla_nest_end(skb, nest);
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -1063,7 +1692,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
if (!skb)
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
- 0, 1) <= 0) {
+ 0, 1, NULL) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
kfree_skb(skb);
return -EINVAL;
@@ -1082,7 +1711,8 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
int index;
int err;
- err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1101,7 +1731,7 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
goto err_out;
}
err = -ENOENT;
- if (ops->lookup(net, &a, index) == 0) {
+ if (__tcf_idr_search(net, ops, &a, index) == 0) {
NL_SET_ERR_MSG(extack, "TC action with specified index not found");
goto err_mod;
}
@@ -1136,7 +1766,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
b = skb_tail_pointer(skb);
- err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1159,13 +1790,13 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
t->tca__pad1 = 0;
t->tca__pad2 = 0;
- nest = nla_nest_start(skb, TCA_ACT_TAB);
+ nest = nla_nest_start_noflag(skb, TCA_ACT_TAB);
if (!nest) {
NL_SET_ERR_MSG(extack, "Failed to add new netlink message");
goto out_module_put;
}
- err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops, extack);
+ err = __tcf_generic_walker(net, skb, &dcb, RTM_DELACTION, ops, extack);
if (err <= 0) {
nla_nest_cancel(skb, nest);
goto out_module_put;
@@ -1178,8 +1809,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
module_put(ops->owner);
err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
- if (err > 0)
- return 0;
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification");
@@ -1194,10 +1823,10 @@ err_out:
static int tcf_action_delete(struct net *net, struct tc_action *actions[])
{
+ struct tc_action *a;
int i;
- for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
- struct tc_action *a = actions[i];
+ tcf_act_for_each_action(i, a, actions) {
const struct tc_action_ops *ops = a->ops;
/* Actions can be deleted concurrently so we must save their
* type and id to search again after reference is released.
@@ -1209,7 +1838,7 @@ static int tcf_action_delete(struct net *net, struct tc_action *actions[])
if (tcf_action_put(a)) {
/* last reference, action was deleted concurrently */
module_put(ops->owner);
- } else {
+ } else {
int ret;
/* now do the delete */
@@ -1221,23 +1850,145 @@ static int tcf_action_delete(struct net *net, struct tc_action *actions[])
return 0;
}
-static int
-tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
- u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
+static struct sk_buff *tcf_reoffload_del_notify_msg(struct net *net,
+ struct tc_action *action)
+{
+ size_t attr_size = tcf_action_fill_size(action);
+ struct tc_action *actions[TCA_ACT_MAX_PRIO] = {
+ [0] = action,
+ };
+ struct sk_buff *skb;
+
+ skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOBUFS);
+
+ if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) {
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return skb;
+}
+
+static int tcf_reoffload_del_notify(struct net *net, struct tc_action *action)
+{
+ const struct tc_action_ops *ops = action->ops;
+ struct sk_buff *skb;
+ int ret;
+
+ if (!rtnl_notify_needed(net, 0, RTNLGRP_TC)) {
+ skb = NULL;
+ } else {
+ skb = tcf_reoffload_del_notify_msg(net, action);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+ }
+
+ ret = tcf_idr_release_unsafe(action);
+ if (ret == ACT_P_DELETED) {
+ module_put(ops->owner);
+ ret = rtnetlink_maybe_send(skb, net, 0, RTNLGRP_TC, 0);
+ } else {
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb,
+ void *cb_priv, bool add)
{
+ struct tc_act_pernet_id *id_ptr;
+ struct tcf_idrinfo *idrinfo;
+ struct tc_action_net *tn;
+ struct tc_action *p;
+ unsigned int act_id;
+ unsigned long tmp;
+ unsigned long id;
+ struct idr *idr;
+ struct net *net;
int ret;
+
+ if (!cb)
+ return -EINVAL;
+
+ down_read(&net_rwsem);
+ mutex_lock(&act_id_mutex);
+
+ for_each_net(net) {
+ list_for_each_entry(id_ptr, &act_pernet_id_list, list) {
+ act_id = id_ptr->id;
+ tn = net_generic(net, act_id);
+ if (!tn)
+ continue;
+ idrinfo = tn->idrinfo;
+ if (!idrinfo)
+ continue;
+
+ mutex_lock(&idrinfo->lock);
+ idr = &idrinfo->action_idr;
+ idr_for_each_entry_ul(idr, p, tmp, id) {
+ if (IS_ERR(p) || tc_act_bind(p->tcfa_flags))
+ continue;
+ if (add) {
+ tcf_action_offload_add_ex(p, NULL, cb,
+ cb_priv);
+ continue;
+ }
+
+ /* cb unregister to update hw count */
+ ret = tcf_action_offload_del_ex(p, cb, cb_priv);
+ if (ret < 0)
+ continue;
+ if (tc_act_skip_sw(p->tcfa_flags) &&
+ !tc_act_in_hw(p))
+ tcf_reoffload_del_notify(net, p);
+ }
+ mutex_unlock(&idrinfo->lock);
+ }
+ }
+ mutex_unlock(&act_id_mutex);
+ up_read(&net_rwsem);
+
+ return 0;
+}
+
+static struct sk_buff *tcf_del_notify_msg(struct net *net, struct nlmsghdr *n,
+ struct tc_action *actions[],
+ u32 portid, size_t attr_size,
+ struct netlink_ext_ack *extack)
+{
struct sk_buff *skb;
- skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
- GFP_KERNEL);
+ skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL);
if (!skb)
- return -ENOBUFS;
+ return ERR_PTR(-ENOBUFS);
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
- 0, 2) <= 0) {
+ 0, 2, extack) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
kfree_skb(skb);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
+ }
+
+ return skb;
+}
+
+static int tcf_del_notify(struct net *net, struct nlmsghdr *n,
+ struct tc_action *actions[], u32 portid,
+ size_t attr_size, struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ int ret;
+
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
+ skb = NULL;
+ } else {
+ skb = tcf_del_notify_msg(net, n, actions, portid, attr_size,
+ extack);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
}
/* now do the delete */
@@ -1248,11 +1999,8 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
return ret;
}
- ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
- if (ret > 0)
- return 0;
- return ret;
+ return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
}
static int
@@ -1265,7 +2013,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
size_t attr_size = 0;
struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
- ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
+ ret = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL,
+ extack);
if (ret < 0)
return ret;
@@ -1283,7 +2032,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
ret = PTR_ERR(act);
goto err;
}
- act->order = i;
attr_size += tcf_action_fill_size(act);
actions[i - 1] = act;
}
@@ -1303,55 +2051,76 @@ err:
return ret;
}
-static int
-tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
- u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
+static struct sk_buff *tcf_add_notify_msg(struct net *net, struct nlmsghdr *n,
+ struct tc_action *actions[],
+ u32 portid, size_t attr_size,
+ struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
- int err = 0;
- skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
- GFP_KERNEL);
+ skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL);
if (!skb)
- return -ENOBUFS;
+ return ERR_PTR(-ENOBUFS);
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
- RTM_NEWACTION, 0, 0) <= 0) {
+ RTM_NEWACTION, 0, 0, extack) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
kfree_skb(skb);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
- err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
- if (err > 0)
- err = 0;
- return err;
+ return skb;
+}
+
+static int tcf_add_notify(struct net *net, struct nlmsghdr *n,
+ struct tc_action *actions[], u32 portid,
+ size_t attr_size, struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
+ skb = NULL;
+ } else {
+ skb = tcf_add_notify_msg(net, n, actions, portid, attr_size,
+ extack);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+ }
+
+ return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
}
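tcf_add_notify() and tcf_del_notify() above now consult rtnl_notify_needed() and skip allocating and filling the notification skb entirely when nobody is listening and no echo was requested. A hedged sketch of that "build the message only if someone will read it" pattern; the listener check and send helper below are placeholders, not kernel functions:

#include <stdio.h>
#include <stdlib.h>

/* Placeholder for rtnl_notify_needed(): pretend flags tell us whether
 * any subscriber exists or an echo was requested. */
static int notify_needed(int have_listeners, int echo_requested)
{
    return have_listeners || echo_requested;
}

static char *build_message(const char *what)
{
    char *msg = malloc(64);

    if (msg)
        snprintf(msg, 64, "notification: %s", what);
    return msg;
}

/* A NULL message simply means "nothing to send", not an error. */
static void maybe_send(char *msg)
{
    if (!msg)
        return;
    puts(msg);
    free(msg);
}

int main(void)
{
    char *msg = NULL;

    if (notify_needed(/*have_listeners=*/0, /*echo_requested=*/1))
        msg = build_message("action added");

    maybe_send(msg);   /* cheap no-op when the message was skipped */
    return 0;
}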
static int tcf_action_add(struct net *net, struct nlattr *nla,
- struct nlmsghdr *n, u32 portid, int ovr,
+ struct nlmsghdr *n, u32 portid, u32 flags,
struct netlink_ext_ack *extack)
{
size_t attr_size = 0;
- int ret = 0;
+ int loop, ret;
struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
+ int init_res[TCA_ACT_MAX_PRIO] = {};
+
+ for (loop = 0; loop < 10; loop++) {
+ ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res,
+ &attr_size, flags, 0, extack);
+ if (ret != -EAGAIN)
+ break;
+ }
- ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions,
- &attr_size, true, extack);
if (ret < 0)
return ret;
+
ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
- if (ovr)
- tcf_action_put_many(actions);
+
+ /* only put bound actions */
+ tca_put_bound_many(actions, init_res);
return ret;
}
-static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
- [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32,
- .validation_data = &tcaa_root_flags_allowed },
+ [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAG_LARGE_DUMP_ON |
+ TCA_ACT_FLAG_TERSE_DUMP),
[TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 },
};
@@ -1360,15 +2129,16 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_ROOT_MAX + 1];
- u32 portid = skb ? NETLINK_CB(skb).portid : 0;
- int ret = 0, ovr = 0;
+ u32 portid = NETLINK_CB(skb).portid;
+ u32 flags = 0;
+ int ret = 0;
if ((n->nlmsg_type != RTM_GETACTION) &&
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
- ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
- extack);
+ ret = nlmsg_parse_deprecated(n, sizeof(struct tcamsg), tca,
+ TCA_ROOT_MAX, NULL, extack);
if (ret < 0)
return ret;
@@ -1387,12 +2157,9 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
* is zero) then just set this
*/
if (n->nlmsg_flags & NLM_F_REPLACE)
- ovr = 1;
-replay:
- ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
+ flags = TCA_ACT_FLAGS_REPLACE;
+ ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags,
extack);
- if (ret == -EAGAIN)
- goto replay;
break;
case RTM_DELACTION:
ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
@@ -1419,13 +2186,12 @@ static struct nlattr *find_dump_kind(struct nlattr **nla)
if (tb1 == NULL)
return NULL;
- if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1),
- NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0)
+ if (nla_parse_deprecated(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0)
return NULL;
if (tb[1] == NULL)
return NULL;
- if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0)
+ if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0)
return NULL;
kind = tb2[TCA_ACT_KIND];
@@ -1449,8 +2215,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
u32 msecs_since = 0;
u32 act_count = 0;
- ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
- tcaa_policy, cb->extack);
+ ret = nlmsg_parse_deprecated(cb->nlh, sizeof(struct tcamsg), tb,
+ TCA_ROOT_MAX, tcaa_policy, cb->extack);
if (ret < 0)
return ret;
@@ -1491,11 +2257,11 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
if (!count_attr)
goto out_module_put;
- nest = nla_nest_start(skb, TCA_ACT_TAB);
+ nest = nla_nest_start_noflag(skb, TCA_ACT_TAB);
if (nest == NULL)
goto out_module_put;
- ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o, NULL);
+ ret = __tcf_generic_walker(net, skb, cb, RTM_GETACTION, a_o, NULL);
if (ret < 0)
goto out_module_put;
@@ -1520,13 +2286,16 @@ out_module_put:
return skb->len;
}
+static const struct rtnl_msg_handler tc_action_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWACTION, .doit = tc_ctl_action},
+ {.msgtype = RTM_DELACTION, .doit = tc_ctl_action},
+ {.msgtype = RTM_GETACTION, .doit = tc_ctl_action,
+ .dumpit = tc_dump_action},
+};
+
static int __init tc_action_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
- 0);
-
+ rtnl_register_many(tc_action_rtnl_msg_handlers);
return 0;
}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c7633843e223..c2b5bc19e091 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -16,10 +12,13 @@
#include <linux/bpf.h>
#include <net/netlink.h>
+#include <net/sock.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_bpf.h>
#include <net/tc_act/tc_bpf.h>
+#include <net/tc_wrapper.h>
#define ACT_BPF_NAME_LEN 256
@@ -31,11 +30,11 @@ struct tcf_bpf_cfg {
bool is_ebpf;
};
-static unsigned int bpf_net_id;
static struct tc_action_ops act_bpf_ops;
-static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb,
+ const struct tc_action *act,
+ struct tcf_result *res)
{
bool at_ingress = skb_at_tc_ingress(skb);
struct tcf_bpf *prog = to_bpf(act);
@@ -43,20 +42,20 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
int action, filter_res;
tcf_lastuse_update(&prog->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
+ bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb);
- rcu_read_lock();
filter = rcu_dereference(prog->filter);
if (at_ingress) {
__skb_push(skb, skb->mac_len);
- bpf_compute_data_pointers(skb);
- filter_res = BPF_PROG_RUN(filter, skb);
+ filter_res = bpf_prog_run_data_pointers(filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
- bpf_compute_data_pointers(skb);
- filter_res = BPF_PROG_RUN(filter, skb);
+ filter_res = bpf_prog_run_data_pointers(filter, skb);
}
- rcu_read_unlock();
+ if (unlikely(!skb->tstamp && skb->tstamp_type))
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
+ if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
+ skb_orphan(skb);
/* A BPF program may overwrite the default action opcode.
* Similarly as in cls_bpf, if filter_res == -1 we use the
@@ -65,7 +64,7 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
* In case a different well-known TC_ACT opcode has been
* returned, it will overwrite the default one.
*
- * For everything else that is unkown, TC_ACT_UNSPEC is
+ * For everything else that is unknown, TC_ACT_UNSPEC is
* returned.
*/
switch (filter_res) {
@@ -277,21 +276,25 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
- int replace, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, bpf_net_id);
+ struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tcf_bpf_cfg cfg, old;
struct tc_act_bpf *parm;
struct tcf_bpf *prog;
bool is_bpf, is_ebpf;
int ret, res = 0;
+ u32 index;
if (!nla)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy, NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_ACT_BPF_MAX, nla,
+ act_bpf_policy, NULL);
if (ret < 0)
return ret;
@@ -299,13 +302,13 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
-
- ret = tcf_idr_check_alloc(tn, &parm->index, act, bind);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, act, bind);
if (!ret) {
- ret = tcf_idr_create(tn, parm->index, est, act,
- &act_bpf_ops, bind, true);
+ ret = tcf_idr_create(tn, index, est, act,
+ &act_bpf_ops, bind, true, flags);
if (ret < 0) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -313,9 +316,9 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
} else if (ret > 0) {
/* Don't override defaults. */
if (bind)
- return 0;
+ return ACT_P_BOUND;
- if (!replace) {
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*act, bind);
return -EEXIST;
}
@@ -323,12 +326,16 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
return ret;
}
+ ret = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (ret < 0)
+ goto release_idr;
+
is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
is_ebpf = tb[TCA_ACT_BPF_FD];
- if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) {
+ if (is_bpf == is_ebpf) {
ret = -EINVAL;
- goto out;
+ goto put_chain;
}
memset(&cfg, 0, sizeof(cfg));
@@ -336,7 +343,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
tcf_bpf_init_from_efd(tb, &cfg);
if (ret < 0)
- goto out;
+ goto put_chain;
prog = to_bpf(*act);
@@ -350,22 +357,27 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (cfg.bpf_num_ops)
prog->bpf_num_ops = cfg.bpf_num_ops;
- prog->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*act, parm->action, goto_ch);
rcu_assign_pointer(prog->filter, cfg.filter);
spin_unlock_bh(&prog->tcf_lock);
- if (res == ACT_P_CREATED) {
- tcf_idr_insert(tn, *act);
- } else {
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+ if (res != ACT_P_CREATED) {
/* make sure the program being replaced is no longer executing */
synchronize_rcu();
tcf_bpf_cfg_cleanup(&old);
}
return res;
-out:
- tcf_idr_release(*act, bind);
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+release_idr:
+ tcf_idr_release(*act, bind);
return ret;
}
@@ -377,52 +389,34 @@ static void tcf_bpf_cleanup(struct tc_action *act)
tcf_bpf_cfg_cleanup(&tmp);
}
-static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, bpf_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, bpf_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
static struct tc_action_ops act_bpf_ops __read_mostly = {
.kind = "bpf",
- .type = TCA_ACT_BPF,
+ .id = TCA_ID_BPF,
.owner = THIS_MODULE,
.act = tcf_bpf_act,
.dump = tcf_bpf_dump,
.cleanup = tcf_bpf_cleanup,
.init = tcf_bpf_init,
- .walk = tcf_bpf_walker,
- .lookup = tcf_bpf_search,
.size = sizeof(struct tcf_bpf),
};
+MODULE_ALIAS_NET_ACT("bpf");
static __net_init int bpf_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, bpf_net_id);
+ struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id);
- return tc_action_net_init(tn, &act_bpf_ops);
+ return tc_action_net_init(net, tn, &act_bpf_ops);
}
static void __net_exit bpf_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, bpf_net_id);
+ tc_action_net_exit(net_list, act_bpf_ops.net_id);
}
static struct pernet_operations bpf_net_ops = {
.init = bpf_init_net,
.exit_batch = bpf_exit_net,
- .id = &bpf_net_id,
+ .id = &act_bpf_ops.net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 8475913f2070..26ba8c2d20ab 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_connmark.c netfilter connmark retriever action
* skb mark is over-written
*
* Copyright (c) 2011 Felix Fietkau <nbd@openwrt.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -21,73 +17,78 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/act_api.h>
+#include <net/pkt_cls.h>
#include <uapi/linux/tc_act/tc_connmark.h>
#include <net/tc_act/tc_connmark.h>
+#include <net/tc_wrapper.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
-static unsigned int connmark_net_id;
static struct tc_action_ops act_connmark_ops;
-static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
const struct nf_conntrack_tuple_hash *thash;
struct nf_conntrack_tuple tuple;
enum ip_conntrack_info ctinfo;
struct tcf_connmark_info *ca = to_connmark(a);
+ struct tcf_connmark_parms *parms;
struct nf_conntrack_zone zone;
struct nf_conn *c;
int proto;
- spin_lock(&ca->tcf_lock);
tcf_lastuse_update(&ca->tcf_tm);
- bstats_update(&ca->tcf_bstats, skb);
+ tcf_action_update_bstats(&ca->common, skb);
+
+ parms = rcu_dereference_bh(ca->parms);
- if (skb->protocol == htons(ETH_P_IP)) {
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
if (skb->len < sizeof(struct iphdr))
goto out;
proto = NFPROTO_IPV4;
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ break;
+ case htons(ETH_P_IPV6):
if (skb->len < sizeof(struct ipv6hdr))
goto out;
proto = NFPROTO_IPV6;
- } else {
+ break;
+ default:
goto out;
}
c = nf_ct_get(skb, &ctinfo);
if (c) {
- skb->mark = c->mark;
- /* using overlimits stats to count how many packets marked */
- ca->tcf_qstats.overlimits++;
- goto out;
+ skb->mark = READ_ONCE(c->mark);
+ goto count;
}
- if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- proto, ca->net, &tuple))
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, parms->net,
+ &tuple))
goto out;
- zone.id = ca->zone;
+ zone.id = parms->zone;
zone.dir = NF_CT_DEFAULT_ZONE_DIR;
- thash = nf_conntrack_find_get(ca->net, &zone, &tuple);
+ thash = nf_conntrack_find_get(parms->net, &zone, &tuple);
if (!thash)
goto out;
c = nf_ct_tuplehash_to_ctrack(thash);
- /* using overlimits stats to count how many packets marked */
- ca->tcf_qstats.overlimits++;
- skb->mark = c->mark;
+ skb->mark = READ_ONCE(c->mark);
nf_ct_put(c);
+count:
+ /* using overlimits stats to count how many packets marked */
+ tcf_action_inc_overlimit_qstats(&ca->common);
out:
- spin_unlock(&ca->tcf_lock);
- return ca->tcf_action;
+ return parms->action;
}
static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
@@ -96,78 +97,118 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, connmark_net_id);
+ struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id);
+ struct tcf_connmark_parms *nparms, *oparms;
struct nlattr *tb[TCA_CONNMARK_MAX + 1];
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
+ struct tcf_chain *goto_ch = NULL;
struct tcf_connmark_info *ci;
struct tc_connmark *parm;
- int ret = 0;
+ int ret = 0, err;
+ u32 index;
if (!nla)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy,
- NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_CONNMARK_MAX, nla,
+ connmark_policy, NULL);
if (ret < 0)
return ret;
if (!tb[TCA_CONNMARK_PARMS])
return -EINVAL;
- parm = nla_data(tb[TCA_CONNMARK_PARMS]);
+ nparms = kzalloc(sizeof(*nparms), GFP_KERNEL);
+ if (!nparms)
+ return -ENOMEM;
- ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ parm = nla_data(tb[TCA_CONNMARK_PARMS]);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, a, bind);
if (!ret) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_connmark_ops, bind, false);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_connmark_ops, bind, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
- return ret;
+ tcf_idr_cleanup(tn, index);
+ err = ret;
+ goto out_free;
}
ci = to_connmark(*a);
- ci->tcf_action = parm->action;
- ci->net = net;
- ci->zone = parm->zone;
- tcf_idr_insert(tn, *a);
+ nparms->net = net;
+ nparms->zone = parm->zone;
+
ret = ACT_P_CREATED;
} else if (ret > 0) {
ci = to_connmark(*a);
- if (bind)
- return 0;
- if (!ovr) {
- tcf_idr_release(*a, bind);
- return -EEXIST;
+ if (bind) {
+ err = ACT_P_BOUND;
+ goto out_free;
}
- /* replacing action and zone */
- spin_lock_bh(&ci->tcf_lock);
- ci->tcf_action = parm->action;
- ci->zone = parm->zone;
- spin_unlock_bh(&ci->tcf_lock);
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+ err = -EEXIST;
+ goto release_idr;
+ }
+
+ nparms->net = rtnl_dereference(ci->parms)->net;
+ nparms->zone = parm->zone;
+
ret = 0;
+ } else {
+ err = ret;
+ goto out_free;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ nparms->action = parm->action;
+
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock));
+ spin_unlock_bh(&ci->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+ if (oparms)
+ kfree_rcu(oparms, rcu);
+
return ret;
+
+release_idr:
+ tcf_idr_release(*a, bind);
+out_free:
+ kfree(nparms);
+ return err;
}
static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
+ const struct tcf_connmark_info *ci = to_connmark(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_connmark_info *ci = to_connmark(a);
- struct tc_connmark opt = {
- .index = ci->tcf_index,
- .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
- .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
- };
+ const struct tcf_connmark_parms *parms;
+ struct tc_connmark opt;
struct tcf_t t;
- spin_lock_bh(&ci->tcf_lock);
- opt.action = ci->tcf_action;
- opt.zone = ci->zone;
+ memset(&opt, 0, sizeof(opt));
+
+ opt.index = ci->tcf_index;
+ opt.refcnt = refcount_read(&ci->tcf_refcnt) - ref;
+ opt.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind;
+
+ rcu_read_lock();
+ parms = rcu_dereference(ci->parms);
+
+ opt.action = parms->action;
+ opt.zone = parms->zone;
if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -175,61 +216,54 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
TCA_CONNMARK_PAD))
goto nla_put_failure;
- spin_unlock_bh(&ci->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&ci->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
-static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, connmark_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
+static void tcf_connmark_cleanup(struct tc_action *a)
{
- struct tc_action_net *tn = net_generic(net, connmark_net_id);
+ struct tcf_connmark_info *ci = to_connmark(a);
+ struct tcf_connmark_parms *parms;
- return tcf_idr_search(tn, a, index);
+ parms = rcu_dereference_protected(ci->parms, 1);
+ if (parms)
+ kfree_rcu(parms, rcu);
}
static struct tc_action_ops act_connmark_ops = {
.kind = "connmark",
- .type = TCA_ACT_CONNMARK,
+ .id = TCA_ID_CONNMARK,
.owner = THIS_MODULE,
.act = tcf_connmark_act,
.dump = tcf_connmark_dump,
.init = tcf_connmark_init,
- .walk = tcf_connmark_walker,
- .lookup = tcf_connmark_search,
+ .cleanup = tcf_connmark_cleanup,
.size = sizeof(struct tcf_connmark_info),
};
+MODULE_ALIAS_NET_ACT("connmark");
static __net_init int connmark_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, connmark_net_id);
+ struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id);
- return tc_action_net_init(tn, &act_connmark_ops);
+ return tc_action_net_init(net, tn, &act_connmark_ops);
}
static void __net_exit connmark_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, connmark_net_id);
+ tc_action_net_exit(net_list, act_connmark_ops.net_id);
}
static struct pernet_operations connmark_net_ops = {
.init = connmark_init_net,
.exit_batch = connmark_exit_net,
- .id = &connmark_net_id,
+ .id = &act_connmark_ops.net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 3dc25b7806d7..0939e6b2ba4d 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Checksum updating actions
*
* Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <linux/types.h>
@@ -33,53 +28,58 @@
#include <net/sctp/checksum.h>
#include <net/act_api.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_csum.h>
#include <net/tc_act/tc_csum.h>
+#include <net/tc_wrapper.h>
static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
};
-static unsigned int csum_net_id;
static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct nlattr *est, struct tc_action **a,
+ struct tcf_proto *tp,
+ u32 flags, struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, csum_net_id);
+ struct tc_action_net *tn = net_generic(net, act_csum_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct tcf_csum_params *params_new;
struct nlattr *tb[TCA_CSUM_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_csum *parm;
struct tcf_csum *p;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CSUM_MAX, nla, csum_policy,
+ NULL);
if (err < 0)
return err;
if (tb[TCA_CSUM_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_csum_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_csum_ops, bind, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
} else if (err > 0) {
- if (bind)/* dont override defaults */
- return 0;
- if (!ovr) {
+ if (bind) /* dont override defaults */
+ return ACT_P_BOUND;
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
@@ -87,28 +87,38 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
return err;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
p = to_tcf_csum(*a);
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
params_new->update_flags = parm->update_flags;
+ params_new->action = parm->action;
spin_lock_bh(&p->tcf_lock);
- p->tcf_action = parm->action;
- rcu_swap_protected(p->params, params_new,
- lockdep_is_held(&p->tcf_lock));
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ params_new = rcu_replace_pointer(p->params, params_new,
+ lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (params_new)
kfree_rcu(params_new, rcu);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
-
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
/**
@@ -367,8 +377,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
sctph->checksum = sctp_compute_cksum(skb,
skb_network_offset(skb) + ihl);
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum_not_inet = 0;
+ skb_reset_csum_not_inet(skb);
return 1;
}
@@ -555,25 +564,31 @@ fail:
return 0;
}
-static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_csum *p = to_tcf_csum(a);
+ bool orig_vlan_tag_present = false;
+ unsigned int vlan_hdr_count = 0;
struct tcf_csum_params *params;
u32 update_flags;
+ __be16 protocol;
int action;
params = rcu_dereference_bh(p->params);
tcf_lastuse_update(&p->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&p->common, skb);
- action = READ_ONCE(p->tcf_action);
+ action = params->action;
if (unlikely(action == TC_ACT_SHOT))
goto drop;
update_flags = params->update_flags;
- switch (tc_skb_protocol(skb)) {
+ protocol = skb_protocol(skb, false);
+again:
+ switch (protocol) {
case cpu_to_be16(ETH_P_IP):
if (!tcf_csum_ipv4(skb, update_flags))
goto drop;
@@ -582,21 +597,44 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
if (!tcf_csum_ipv6(skb, update_flags))
goto drop;
break;
+ case cpu_to_be16(ETH_P_8021AD):
+ fallthrough;
+ case cpu_to_be16(ETH_P_8021Q):
+ if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) {
+ protocol = skb->protocol;
+ orig_vlan_tag_present = true;
+ } else {
+ struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data;
+
+ protocol = vlan->h_vlan_encapsulated_proto;
+ skb_pull(skb, VLAN_HLEN);
+ skb_reset_network_header(skb);
+ vlan_hdr_count++;
+ }
+ goto again;
+ }
+
+out:
+ /* Restore the skb for the pulled VLAN tags */
+ while (vlan_hdr_count--) {
+ skb_push(skb, VLAN_HLEN);
+ skb_reset_network_header(skb);
}
return action;
drop:
- qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
- return TC_ACT_SHOT;
+ tcf_action_inc_drop_qstats(&p->common);
+ action = TC_ACT_SHOT;
+ goto out;
}
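The tcf_csum_act() change above peels stacked 802.1Q/802.1AD headers one at a time to reach the IP payload, counts how many were pulled, and pushes them all back before returning so the skb leaves the action unchanged. A small standalone sketch of that peel-and-restore idea on a byte buffer; only the 4-byte tag size is borrowed from the real code, everything else is a toy:

#include <stdio.h>

#define VLAN_HLEN 4

struct buf {
    const unsigned char *data;   /* current start of headers */
    int len;
};

static void pull(struct buf *b, int n) { b->data += n; b->len -= n; }
static void push(struct buf *b, int n) { b->data -= n; b->len += n; }

int main(void)
{
    /* Two stacked VLAN tags (dummy bytes) followed by "payload". */
    static const unsigned char pkt[] = {
        0x81, 0x00, 0x00, 0x01,   /* outer tag */
        0x81, 0x00, 0x00, 0x02,   /* inner tag */
        'p', 'a', 'y', 'l', 'o', 'a', 'd',
    };
    struct buf b = { pkt, sizeof(pkt) };
    int vlan_hdr_count = 0;

    /* Peel tags until the payload is reachable, remembering how many. */
    while (vlan_hdr_count < 2) {
        pull(&b, VLAN_HLEN);
        vlan_hdr_count++;
    }
    printf("payload starts with '%c', %d tags pulled\n",
           b.data[0], vlan_hdr_count);

    /* Restore the buffer for the pulled VLAN tags, as the action does. */
    while (vlan_hdr_count--)
        push(&b, VLAN_HLEN);
    printf("restored length: %d (original %zu)\n", b.len, sizeof(pkt));
    return 0;
}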
static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
int ref)
{
+ const struct tcf_csum *p = to_tcf_csum(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_csum *p = to_tcf_csum(a);
- struct tcf_csum_params *params;
+ const struct tcf_csum_params *params;
struct tc_csum opt = {
.index = p->tcf_index,
.refcnt = refcount_read(&p->tcf_refcnt) - ref,
@@ -604,10 +642,9 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
};
struct tcf_t t;
- spin_lock_bh(&p->tcf_lock);
- params = rcu_dereference_protected(p->params,
- lockdep_is_held(&p->tcf_lock));
- opt.action = p->tcf_action;
+ rcu_read_lock();
+ params = rcu_dereference(p->params);
+ opt.action = params->action;
opt.update_flags = params->update_flags;
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
@@ -616,12 +653,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
goto nla_put_failure;
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
@@ -636,58 +673,60 @@ static void tcf_csum_cleanup(struct tc_action *a)
kfree_rcu(params, rcu);
}
-static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
+static size_t tcf_csum_get_fill_size(const struct tc_action *act)
{
- struct tc_action_net *tn = net_generic(net, csum_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+ return nla_total_size(sizeof(struct tc_csum));
}
-static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_csum_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, csum_net_id);
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
- return tcf_idr_search(tn, a, index);
-}
+ entry->id = FLOW_ACTION_CSUM;
+ entry->csum_flags = tcf_csum_update_flags(act);
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
-static size_t tcf_csum_get_fill_size(const struct tc_action *act)
-{
- return nla_total_size(sizeof(struct tc_csum));
+ fl_action->id = FLOW_ACTION_CSUM;
+ }
+
+ return 0;
}
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
- .type = TCA_ACT_CSUM,
+ .id = TCA_ID_CSUM,
.owner = THIS_MODULE,
.act = tcf_csum_act,
.dump = tcf_csum_dump,
.init = tcf_csum_init,
.cleanup = tcf_csum_cleanup,
- .walk = tcf_csum_walker,
- .lookup = tcf_csum_search,
.get_fill_size = tcf_csum_get_fill_size,
+ .offload_act_setup = tcf_csum_offload_act_setup,
.size = sizeof(struct tcf_csum),
};
+MODULE_ALIAS_NET_ACT("csum");
static __net_init int csum_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, csum_net_id);
+ struct tc_action_net *tn = net_generic(net, act_csum_ops.net_id);
- return tc_action_net_init(tn, &act_csum_ops);
+ return tc_action_net_init(net, tn, &act_csum_ops);
}
static void __net_exit csum_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, csum_net_id);
+ tc_action_net_exit(net_list, act_csum_ops.net_id);
}
static struct pernet_operations csum_net_ops = {
.init = csum_init_net,
.exit_batch = csum_exit_net,
- .id = &csum_net_id,
+ .id = &act_csum_ops.net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
new file mode 100644
index 000000000000..2b6ac7069dc1
--- /dev/null
+++ b/net/sched/act_ct.c
@@ -0,0 +1,1698 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* -
+ * net/sched/act_ct.c Connection Tracking action
+ *
+ * Authors: Paul Blakey <paulb@mellanox.com>
+ * Yossi Kuperman <yossiku@mellanox.com>
+ * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/rhashtable.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/act_api.h>
+#include <net/ip.h>
+#include <net/ipv6_frag.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+#include <net/tc_act/tc_ct.h>
+#include <net/tc_wrapper.h>
+
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <uapi/linux/netfilter/nf_nat.h>
+
+static struct workqueue_struct *act_ct_wq;
+static struct rhashtable zones_ht;
+static DEFINE_MUTEX(zones_mutex);
+
+struct zones_ht_key {
+ struct net *net;
+ u16 zone;
+};
+
+struct tcf_ct_flow_table {
+ struct rhash_head node; /* In zones tables */
+
+ struct rcu_work rwork;
+ struct nf_flowtable nf_ft;
+ refcount_t ref;
+ struct zones_ht_key key;
+
+ bool dying;
+};
+
+static const struct rhashtable_params zones_params = {
+ .head_offset = offsetof(struct tcf_ct_flow_table, node),
+ .key_offset = offsetof(struct tcf_ct_flow_table, key),
+ .key_len = offsetofend(struct zones_ht_key, zone),
+ .automatic_shrinking = true,
+};
+
+static struct flow_action_entry *
+tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
+{
+ int i = flow_action->num_entries++;
+
+ return &flow_action->entries[i];
+}
+
+static void tcf_ct_add_mangle_action(struct flow_action *action,
+ enum flow_action_mangle_base htype,
+ u32 offset,
+ u32 mask,
+ u32 val)
+{
+ struct flow_action_entry *entry;
+
+ entry = tcf_ct_flow_table_flow_action_get_next(action);
+ entry->id = FLOW_ACTION_MANGLE;
+ entry->mangle.htype = htype;
+ entry->mangle.mask = ~mask;
+ entry->mangle.offset = offset;
+ entry->mangle.val = val;
+}
+
+/* The following NAT helper functions check whether the inverted reverse
+ * tuple (target) differs from the current direction's tuple - meaning NAT
+ * of the ports and/or IP addresses is needed - and add the relevant mangle
+ * actions.
+ */
+static void
+tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
+ offsetof(struct iphdr, saddr),
+ 0xFFFFFFFF,
+ be32_to_cpu(target.src.u3.ip));
+ if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
+ offsetof(struct iphdr, daddr),
+ 0xFFFFFFFF,
+ be32_to_cpu(target.dst.u3.ip));
+}
+
+static void
+tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
+ union nf_inet_addr *addr,
+ u32 offset)
+{
+ int i;
+
+ for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+ i * sizeof(u32) + offset,
+ 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
+ tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
+ offsetof(struct ipv6hdr,
+ saddr));
+ if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
+ tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
+ offsetof(struct ipv6hdr,
+ daddr));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ __be16 target_src = target.src.u.tcp.port;
+ __be16 target_dst = target.dst.u.tcp.port;
+
+ if (target_src != tuple->src.u.tcp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ offsetof(struct tcphdr, source),
+ 0xFFFF, be16_to_cpu(target_src));
+ if (target_dst != tuple->dst.u.tcp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ offsetof(struct tcphdr, dest),
+ 0xFFFF, be16_to_cpu(target_dst));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ __be16 target_src = target.src.u.udp.port;
+ __be16 target_dst = target.dst.u.udp.port;
+
+ if (target_src != tuple->src.u.udp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
+ offsetof(struct udphdr, source),
+ 0xFFFF, be16_to_cpu(target_src));
+ if (target_dst != tuple->dst.u.udp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
+ offsetof(struct udphdr, dest),
+ 0xFFFF, be16_to_cpu(target_dst));
+}
+
+static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ enum ip_conntrack_info ctinfo,
+ struct flow_action *action)
+{
+ struct nf_conn_labels *ct_labels;
+ struct flow_action_entry *entry;
+ u32 *act_ct_labels;
+
+ entry = tcf_ct_flow_table_flow_action_get_next(action);
+ entry->id = FLOW_ACTION_CT_METADATA;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ entry->ct_metadata.mark = READ_ONCE(ct->mark);
+#endif
+	/* The cookie uses the same encoding as the CT reference that
+	 * nf_ct_set() stores on the skb.
+	 */
+ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
+ entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;
+
+ act_ct_labels = entry->ct_metadata.labels;
+ ct_labels = nf_ct_labels_find(ct);
+ if (ct_labels)
+ memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
+ else
+ memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
+}
+
+static int tcf_ct_flow_table_add_action_nat(struct net *net,
+ struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct flow_action *action)
+{
+ const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple target;
+
+ if (!(ct->status & IPS_NAT_MASK))
+ return 0;
+
+ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
+
+ switch (tuple->src.l3num) {
+ case NFPROTO_IPV4:
+ tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
+ action);
+ break;
+ case NFPROTO_IPV6:
+ tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
+ action);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
+ break;
+ case IPPROTO_UDP:
+ tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
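To make the NAT helpers above concrete, here is a minimal illustration (an editorial sketch, not part of the patch) of the mangle entries they would append for a hypothetical TCP connection 10.0.0.1:1234 -> 192.0.2.1:80 whose source was NATed to 198.51.100.7:40000; inverting the reply tuple gives target.src = 198.51.100.7:40000, so only the source address and source port are rewritten:

	tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
				 offsetof(struct iphdr, saddr),
				 0xFFFFFFFF, 0xc6336407);	/* 198.51.100.7 */
	tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
				 offsetof(struct tcphdr, source),
				 0xFFFF, 40000);		/* new source port */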
+
+static int tcf_ct_flow_table_fill_actions(struct net *net,
+ struct flow_offload *flow,
+ enum flow_offload_tuple_dir tdir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action *action = &flow_rule->rule->action;
+ int num_entries = action->num_entries;
+ struct nf_conn *ct = flow->ct;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ int i, err;
+
+ switch (tdir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ dir = IP_CT_DIR_ORIGINAL;
+ ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+ IP_CT_ESTABLISHED : IP_CT_NEW;
+ if (ctinfo == IP_CT_ESTABLISHED)
+ set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ dir = IP_CT_DIR_REPLY;
+ ctinfo = IP_CT_ESTABLISHED_REPLY;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
+ if (err)
+ goto err_nat;
+
+ tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
+ return 0;
+
+err_nat:
+ /* Clear filled actions */
+ for (i = num_entries; i < action->num_entries; i++)
+ memset(&action->entries[i], 0, sizeof(action->entries[i]));
+ action->num_entries = num_entries;
+
+ return err;
+}
+
+static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
+{
+ return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
+ test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) &&
+ !test_bit(NF_FLOW_HW_PENDING, &flow->flags) &&
+ !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
+}
+
+static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft);
+
+static void tcf_ct_nf_get(struct nf_flowtable *ft)
+{
+ struct tcf_ct_flow_table *ct_ft =
+ container_of(ft, struct tcf_ct_flow_table, nf_ft);
+
+ tcf_ct_flow_table_get_ref(ct_ft);
+}
+
+static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft);
+
+static void tcf_ct_nf_put(struct nf_flowtable *ft)
+{
+ struct tcf_ct_flow_table *ct_ft =
+ container_of(ft, struct tcf_ct_flow_table, nf_ft);
+
+ tcf_ct_flow_table_put(ct_ft);
+}
+
+static struct nf_flowtable_type flowtable_ct = {
+ .gc = tcf_ct_flow_is_outdated,
+ .action = tcf_ct_flow_table_fill_actions,
+ .get = tcf_ct_nf_get,
+ .put = tcf_ct_nf_put,
+ .owner = THIS_MODULE,
+};
+
+static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
+{
+ struct zones_ht_key key = { .net = net, .zone = params->zone };
+ struct tcf_ct_flow_table *ct_ft;
+ int err = -ENOMEM;
+
+ mutex_lock(&zones_mutex);
+ ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params);
+ if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
+ goto out_unlock;
+
+ ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
+ if (!ct_ft)
+ goto err_alloc;
+ refcount_set(&ct_ft->ref, 1);
+
+ ct_ft->key = key;
+ err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
+ if (err)
+ goto err_insert;
+
+ ct_ft->nf_ft.type = &flowtable_ct;
+ ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD |
+ NF_FLOWTABLE_COUNTER;
+ err = nf_flow_table_init(&ct_ft->nf_ft);
+ if (err)
+ goto err_init;
+ write_pnet(&ct_ft->nf_ft.net, net);
+
+ __module_get(THIS_MODULE);
+out_unlock:
+ params->ct_ft = ct_ft;
+ params->nf_ft = &ct_ft->nf_ft;
+ mutex_unlock(&zones_mutex);
+
+ return 0;
+
+err_init:
+ rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+err_insert:
+ kfree(ct_ft);
+err_alloc:
+ mutex_unlock(&zones_mutex);
+ return err;
+}
+
+static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft)
+{
+ refcount_inc(&ct_ft->ref);
+}
+
+static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+{
+ struct tcf_ct_flow_table *ct_ft;
+ struct flow_block *block;
+
+ ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
+ rwork);
+ nf_flow_table_free(&ct_ft->nf_ft);
+
+ block = &ct_ft->nf_ft.flow_block;
+ down_write(&ct_ft->nf_ft.flow_block_lock);
+ WARN_ON(!list_empty(&block->cb_list));
+ up_write(&ct_ft->nf_ft.flow_block_lock);
+ kfree(ct_ft);
+
+ module_put(THIS_MODULE);
+}
+
+static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
+{
+ if (refcount_dec_and_test(&ct_ft->ref)) {
+ rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+ INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
+ queue_rcu_work(act_ct_wq, &ct_ft->rwork);
+ }
+}
+
+static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
+ struct nf_conn_act_ct_ext *act_ct_ext, u8 dir)
+{
+ entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC;
+ entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir];
+}
+
+static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry)
+{
+ struct nf_conn_act_ct_ext *act_ct_ext;
+
+ act_ct_ext = nf_conn_act_ct_ext_find(entry->ct);
+ if (act_ct_ext) {
+ tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
+ tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
+ }
+}
+
+static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+ bool tcp, bool bidirectional)
+{
+ struct nf_conn_act_ct_ext *act_ct_ext;
+ struct flow_offload *entry;
+ int err;
+
+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return;
+
+ entry = flow_offload_alloc(ct);
+ if (!entry) {
+ WARN_ON_ONCE(1);
+ goto err_alloc;
+ }
+
+ if (tcp) {
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
+ if (bidirectional)
+ __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);
+
+ act_ct_ext = nf_conn_act_ct_ext_find(ct);
+ if (act_ct_ext) {
+ tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
+ tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
+ }
+
+ err = flow_offload_add(&ct_ft->nf_ft, entry);
+ if (err)
+ goto err_add;
+
+ return;
+
+err_add:
+ flow_offload_free(entry);
+err_alloc:
+ clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+}
+
+static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ bool tcp = false, bidirectional = true;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ if ((ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY) ||
+ !test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ return;
+
+ tcp = true;
+ break;
+ case IPPROTO_UDP:
+ if (!nf_ct_is_confirmed(ct))
+ return;
+ if (!test_bit(IPS_ASSURED_BIT, &ct->status))
+ bidirectional = false;
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE: {
+ struct nf_conntrack_tuple *tuple;
+
+ if ((ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY) ||
+ !test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ ct->status & IPS_NAT_MASK)
+ return;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ /* No support for GRE v1 */
+ if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
+ return;
+ break;
+ }
+#endif
+ default:
+ return;
+ }
+
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+ ct->status & IPS_SEQ_ADJUST)
+ return;
+
+ tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct tcphdr **tcph)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+ struct iphdr *iph;
+ size_t hdrsize;
+ u8 ipproto;
+
+ if (!pskb_network_may_pull(skb, sizeof(*iph)))
+ return false;
+
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+
+ if (ip_is_fragment(iph) ||
+ unlikely(thoff != sizeof(struct iphdr)))
+ return false;
+
+ ipproto = iph->protocol;
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ hdrsize = sizeof(*ports);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
+ return false;
+ }
+
+ if (iph->ttl <= 1)
+ return false;
+
+ if (!pskb_network_may_pull(skb, thoff + hdrsize))
+ return false;
+
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ *tcph = (void *)(skb_network_header(skb) + thoff);
+ fallthrough;
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return false;
+ break;
+ }
+ }
+
+ iph = ip_hdr(skb);
+
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = ipproto;
+
+ return true;
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct tcphdr **tcph)
+{
+ struct flow_ports *ports;
+ struct ipv6hdr *ip6h;
+ unsigned int thoff;
+ size_t hdrsize;
+ u8 nexthdr;
+
+ if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+ thoff = sizeof(*ip6h);
+
+ nexthdr = ip6h->nexthdr;
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ hdrsize = sizeof(*ports);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
+ return false;
+ }
+
+ if (ip6h->hop_limit <= 1)
+ return false;
+
+ if (!pskb_network_may_pull(skb, thoff + hdrsize))
+ return false;
+
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ *tcph = (void *)(skb_network_header(skb) + thoff);
+ fallthrough;
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return false;
+ break;
+ }
+ }
+
+ ip6h = ipv6_hdr(skb);
+
+ tuple->src_v6 = ip6h->saddr;
+ tuple->dst_v6 = ip6h->daddr;
+ tuple->l3proto = AF_INET6;
+ tuple->l4proto = nexthdr;
+
+ return true;
+}
+
+static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+ struct sk_buff *skb,
+ u8 family)
+{
+ struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct flow_offload_tuple tuple = {};
+ enum ip_conntrack_info ctinfo;
+ struct tcphdr *tcph = NULL;
+ bool force_refresh = false;
+ struct flow_offload *flow;
+ struct nf_conn *ct;
+ u8 dir;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
+ return false;
+ break;
+ case NFPROTO_IPV6:
+ if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ tuplehash = flow_offload_lookup(nf_ft, &tuple);
+ if (!tuplehash)
+ return false;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ ct = flow->ct;
+
+ if (dir == FLOW_OFFLOAD_DIR_REPLY &&
+ !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
+		/* Only offload the reply direction after the connection has
+		 * become assured.
+		 */
+ if (test_bit(IPS_ASSURED_BIT, &ct->status))
+ set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
+ else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
+ /* If flow_table flow has already been updated to the
+ * established state, then don't refresh.
+ */
+ return false;
+ force_refresh = true;
+ }
+
+ if (tcph && (unlikely(tcph->fin || tcph->rst))) {
+ flow_offload_teardown(flow);
+ return false;
+ }
+
+ if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
+ ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+ IP_CT_ESTABLISHED : IP_CT_NEW;
+ else
+ ctinfo = IP_CT_ESTABLISHED_REPLY;
+
+ nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
+ tcf_ct_flow_ct_ext_ifidx_update(flow);
+ flow_offload_refresh(nf_ft, flow, force_refresh);
+ if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
+ /* Process this flow in SW to allow promoting to ASSURED */
+ return false;
+ }
+
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_set(skb, ct, ctinfo);
+ if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(ct, dir, skb->len);
+
+ return true;
+}
+
+static int tcf_ct_flow_tables_init(void)
+{
+ return rhashtable_init(&zones_ht, &zones_params);
+}
+
+static void tcf_ct_flow_tables_uninit(void)
+{
+ rhashtable_destroy(&zones_ht);
+}
+
+static struct tc_action_ops act_ct_ops;
+
+struct tc_ct_action_net {
+ struct tc_action_net tn; /* Must be first */
+};
+
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
+static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
+ struct tcf_ct_params *p)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+ if (!net_eq(net, read_pnet(&ct->ct_net)))
+ goto drop_ct;
+ if (nf_ct_zone(ct)->id != p->zone)
+ goto drop_ct;
+ if (p->helper) {
+ struct nf_conn_help *help;
+
+ help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+ if (help && rcu_access_pointer(help->helper) != p->helper)
+ goto drop_ct;
+ }
+
+ /* Force conntrack entry direction. */
+ if ((p->ct_action & TCA_CT_ACT_FORCE) &&
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ if (nf_ct_is_confirmed(ct))
+ nf_ct_kill(ct);
+
+ goto drop_ct;
+ }
+
+ return true;
+
+drop_ct:
+ nf_ct_put(ct);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ return false;
+}
+
+static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
+{
+ u8 family = NFPROTO_UNSPEC;
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ family = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ family = NFPROTO_IPV6;
+ break;
+ default:
+ break;
+ }
+
+ return family;
+}
+
+static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
+{
+ unsigned int len;
+
+ len = skb_network_offset(skb) + sizeof(struct iphdr);
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ *frag = ip_is_fragment(ip_hdr(skb));
+ return 0;
+}
+
+static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
+{
+ unsigned int flags = 0, len, payload_ofs = 0;
+ unsigned short frag_off;
+ int nexthdr;
+
+ len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
+ if (unlikely(nexthdr < 0))
+ return -EPROTO;
+
+ *frag = flags & IP6_FH_F_FRAG;
+ return 0;
+}
+
+static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ u8 family, u16 zone, bool *defrag)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ int err = 0;
+ bool frag;
+ u8 proto;
+ u16 mru;
+
+ /* Previously seen (loopback)? Ignore. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+ return 0;
+
+ if (family == NFPROTO_IPV4)
+ err = tcf_ct_ipv4_is_fragment(skb, &frag);
+ else
+ err = tcf_ct_ipv6_is_fragment(skb, &frag);
+ if (err || !frag)
+ return err;
+
+ err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
+ if (err)
+ return err;
+
+ *defrag = true;
+ tc_skb_cb(skb)->mru = mru;
+
+ return 0;
+}
+
+static void tcf_ct_params_free(struct tcf_ct_params *params)
+{
+ if (params->helper) {
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (params->ct_action & TCA_CT_ACT_NAT)
+ nf_nat_helper_put(params->helper);
+#endif
+ nf_conntrack_helper_put(params->helper);
+ }
+ if (params->ct_ft)
+ tcf_ct_flow_table_put(params->ct_ft);
+ if (params->tmpl) {
+ if (params->put_labels)
+ nf_connlabels_put(nf_ct_net(params->tmpl));
+
+ nf_ct_put(params->tmpl);
+ }
+
+ kfree(params);
+}
+
+static void tcf_ct_params_free_rcu(struct rcu_head *head)
+{
+ struct tcf_ct_params *params;
+
+ params = container_of(head, struct tcf_ct_params, rcu);
+ tcf_ct_params_free(params);
+}
+
+static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ u32 new_mark;
+
+ if (!mask)
+ return;
+
+ new_mark = mark | (READ_ONCE(ct->mark) & ~(mask));
+ if (READ_ONCE(ct->mark) != new_mark) {
+ WRITE_ONCE(ct->mark, new_mark);
+ if (nf_ct_is_confirmed(ct))
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+#endif
+}
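A quick worked example of the mark update above, with purely illustrative values: committing mark 0x2a with mask 0x000000ff on a connection whose conntrack mark is already 0xdead0001 only rewrites the low byte.

	/* new_mark = 0x2a | (0xdead0001 & ~0x000000ff)
	 *          = 0x2a | 0xdead0000
	 *          = 0xdead002a
	 */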
+
+static void tcf_ct_act_set_labels(struct nf_conn *ct,
+ u32 *labels,
+ u32 *labels_m)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
+ size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);
+
+ if (!memchr_inv(labels_m, 0, labels_sz))
+ return;
+
+ nf_connlabels_replace(ct, labels, labels_m, 4);
+#endif
+}
+
+static int tcf_ct_act_nat(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ int ct_action,
+ struct nf_nat_range2 *range,
+ bool commit)
+{
+#if IS_ENABLED(CONFIG_NF_NAT)
+ int err, action = 0;
+
+ if (!(ct_action & TCA_CT_ACT_NAT))
+ return NF_ACCEPT;
+ if (ct_action & TCA_CT_ACT_NAT_SRC)
+ action |= BIT(NF_NAT_MANIP_SRC);
+ if (ct_action & TCA_CT_ACT_NAT_DST)
+ action |= BIT(NF_NAT_MANIP_DST);
+
+ err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);
+ if (err != NF_ACCEPT)
+ return err & NF_VERDICT_MASK;
+
+ if (action & BIT(NF_NAT_MANIP_SRC))
+ qdisc_skb_cb(skb)->post_ct_snat = 1;
+ if (action & BIT(NF_NAT_MANIP_DST))
+ qdisc_skb_cb(skb)->post_ct_dnat = 1;
+
+ return err;
+#else
+ return NF_ACCEPT;
+#endif
+}
+
+TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct net *net = dev_net(skb->dev);
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ct *c = to_ct(a);
+ struct nf_conn *tmpl = NULL;
+ struct nf_hook_state state;
+ bool cached, commit, clear;
+ int nh_ofs, err, retval;
+ struct tcf_ct_params *p;
+ bool add_helper = false;
+ bool skip_add = false;
+ bool defrag = false;
+ struct nf_conn *ct;
+ u8 family;
+
+ p = rcu_dereference_bh(c->params);
+
+ retval = p->action;
+ commit = p->ct_action & TCA_CT_ACT_COMMIT;
+ clear = p->ct_action & TCA_CT_ACT_CLEAR;
+ tmpl = p->tmpl;
+
+ tcf_lastuse_update(&c->tcf_tm);
+ tcf_action_update_bstats(&c->common, skb);
+
+ if (clear) {
+ qdisc_skb_cb(skb)->post_ct = false;
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ nf_ct_put(ct);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+ }
+
+ goto out_clear;
+ }
+
+ family = tcf_ct_skb_nf_family(skb);
+ if (family == NFPROTO_UNSPEC)
+ goto drop;
+
+	/* The conntrack module expects to be working at L3.
+	 * We also try to pull the IPv4/6 header into the linear area.
+	 */
+ nh_ofs = skb_network_offset(skb);
+ skb_pull_rcsum(skb, nh_ofs);
+ err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
+ if (err)
+ goto out_frag;
+
+ err = nf_ct_skb_network_trim(skb, family);
+ if (err)
+ goto drop;
+
+ /* If we are recirculating packets to match on ct fields and
+ * committing with a separate ct action, then we don't need to
+ * actually run the packet through conntrack twice unless it's for a
+ * different zone.
+ */
+ cached = tcf_ct_skb_nfct_cached(net, skb, p);
+ if (!cached) {
+ if (tcf_ct_flow_table_lookup(p, skb, family)) {
+ skip_add = true;
+ goto do_nat;
+ }
+
+ /* Associate skb with specified zone. */
+ if (tmpl) {
+ nf_conntrack_put(skb_nfct(skb));
+ nf_conntrack_get(&tmpl->ct_general);
+ nf_ct_set(skb, tmpl, IP_CT_NEW);
+ }
+
+ state.hook = NF_INET_PRE_ROUTING;
+ state.net = net;
+ state.pf = family;
+ err = nf_conntrack_in(skb, &state);
+ if (err != NF_ACCEPT)
+ goto nf_error;
+ }
+
+do_nat:
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ goto out_push;
+ nf_ct_deliver_cached_events(ct);
+ nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
+
+ err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
+ if (err != NF_ACCEPT)
+ goto nf_error;
+
+ if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) {
+ err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC);
+ if (err)
+ goto drop;
+ add_helper = true;
+ if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) {
+ if (!nfct_seqadj_ext_add(ct))
+ goto drop;
+ }
+ }
+
+ if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) {
+ err = nf_ct_helper(skb, ct, ctinfo, family);
+ if (err != NF_ACCEPT)
+ goto nf_error;
+ }
+
+ if (commit) {
+ tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
+ tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
+
+ if (!nf_ct_is_confirmed(ct))
+ nf_conn_act_ct_ext_add(skb, ct, ctinfo);
+
+ /* This will take care of sending queued events
+ * even if the connection is already confirmed.
+ */
+ err = nf_conntrack_confirm(skb);
+ if (err != NF_ACCEPT)
+ goto nf_error;
+
+ /* The ct may be dropped if a clash has been resolved,
+ * so it's necessary to retrieve it from skb again to
+ * prevent UAF.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ skip_add = true;
+ }
+
+ if (!skip_add)
+ tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
+
+out_push:
+ skb_push_rcsum(skb, nh_ofs);
+
+ qdisc_skb_cb(skb)->post_ct = true;
+ tc_skb_cb(skb)->zone = p->zone;
+out_clear:
+ if (defrag)
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ return retval;
+
+out_frag:
+ if (err != -EINPROGRESS)
+ tcf_action_inc_drop_qstats(&c->common);
+ return TC_ACT_CONSUMED;
+
+drop:
+ tcf_action_inc_drop_qstats(&c->common);
+ return TC_ACT_SHOT;
+
+nf_error:
+ /* some verdicts store extra data in upper bits, such
+ * as errno or queue number.
+ */
+ switch (err & NF_VERDICT_MASK) {
+ case NF_DROP:
+ goto drop;
+ case NF_STOLEN:
+ tcf_action_inc_drop_qstats(&c->common);
+ return TC_ACT_CONSUMED;
+ default:
+ DEBUG_NET_WARN_ON_ONCE(1);
+ goto drop;
+ }
+}
+
+static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
+ [TCA_CT_ACTION] = { .type = NLA_U16 },
+ [TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
+ [TCA_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_CT_MARK] = { .type = NLA_U32 },
+ [TCA_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
+ [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
+ [TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN },
+ [TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 },
+ [TCA_CT_HELPER_PROTO] = { .type = NLA_U8 },
+};
+
+static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct nf_nat_range2 *range;
+
+ if (!(p->ct_action & TCA_CT_ACT_NAT))
+ return 0;
+
+ if (!IS_ENABLED(CONFIG_NF_NAT)) {
+ NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
+ return -EOPNOTSUPP;
+ }
+
+ if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+ return 0;
+
+ if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
+ (p->ct_action & TCA_CT_ACT_NAT_DST)) {
+ NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
+ return -EOPNOTSUPP;
+ }
+
+ range = &p->range;
+ if (tb[TCA_CT_NAT_IPV4_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
+
+ p->ipv4_range = true;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.ip =
+ nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
+
+ range->max_addr.ip =
+ nla_get_in_addr_default(max_attr, range->min_addr.ip);
+ } else if (tb[TCA_CT_NAT_IPV6_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
+
+ p->ipv4_range = false;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.in6 =
+ nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
+
+ range->max_addr.in6 = max_attr ?
+ nla_get_in6_addr(max_attr) :
+ range->min_addr.in6;
+ }
+
+ if (tb[TCA_CT_NAT_PORT_MIN]) {
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
+
+ range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
+ nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
+ range->min_proto.all;
+ }
+
+ return 0;
+}
+
+static void tcf_ct_set_key_val(struct nlattr **tb,
+ void *val, int val_type,
+ void *mask, int mask_type,
+ int len)
+{
+ if (!tb[val_type])
+ return;
+ nla_memcpy(val, tb[val_type], len);
+
+ if (!mask)
+ return;
+
+ if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
+ memset(mask, 0xff, len);
+ else
+ nla_memcpy(mask, tb[mask_type], len);
+}
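A short usage note on the helper above (illustrative values): when only the value attribute is supplied, the mask defaults to all ones, so the whole field is replaced at commit time.

	/* TCA_CT_MARK = 0x1 and no TCA_CT_MARK_MASK supplied:
	 *   p->mark      = 0x00000001
	 *   p->mark_mask = 0xffffffff  (memset of 0xff over sizeof(p->mark))
	 */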
+
+static int tcf_ct_fill_params(struct net *net,
+ struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct nf_conntrack_zone zone;
+ int err, family, proto, len;
+ bool put_labels = false;
+ struct nf_conn *tmpl;
+ char *name;
+
+ p->zone = NF_CT_DEFAULT_ZONE_ID;
+
+ tcf_ct_set_key_val(tb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action));
+
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ return 0;
+
+ err = tcf_ct_fill_params_nat(p, parm, tb, extack);
+ if (err)
+ return err;
+
+ if (tb[TCA_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+ tcf_ct_set_key_val(tb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark));
+ }
+
+ if (tb[TCA_CT_LABELS]) {
+ unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
+
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ if (nf_connlabels_get(net, n_bits - 1)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
+ return -EOPNOTSUPP;
+ } else {
+ put_labels = true;
+ }
+
+ tcf_ct_set_key_val(tb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels));
+ }
+
+ if (tb[TCA_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ tcf_ct_set_key_val(tb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone));
+ }
+
+ nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
+ tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
+ if (!tmpl) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
+ return -ENOMEM;
+ }
+ p->tmpl = tmpl;
+ if (tb[TCA_CT_HELPER_NAME]) {
+ name = nla_data(tb[TCA_CT_HELPER_NAME]);
+ len = nla_len(tb[TCA_CT_HELPER_NAME]);
+ if (len > 16 || name[len - 1] != '\0') {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name.");
+ err = -EINVAL;
+ goto err;
+ }
+ family = nla_get_u8_default(tb[TCA_CT_HELPER_FAMILY], AF_INET);
+ proto = nla_get_u8_default(tb[TCA_CT_HELPER_PROTO],
+ IPPROTO_TCP);
+ err = nf_ct_add_helper(tmpl, name, family, proto,
+ p->ct_action & TCA_CT_ACT_NAT, &p->helper);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to add helper");
+ goto err;
+ }
+ }
+
+ p->put_labels = put_labels;
+
+ if (p->ct_action & TCA_CT_ACT_COMMIT)
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ return 0;
+err:
+ if (put_labels)
+ nf_connlabels_put(net);
+
+ nf_ct_put(p->tmpl);
+ p->tmpl = NULL;
+ return err;
+}
+
+static int tcf_ct_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
+ struct tcf_ct_params *params = NULL;
+ struct nlattr *tb[TCA_CT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ struct tc_ct *parm;
+ struct tcf_ct *c;
+ int err, res = 0;
+ u32 index;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CT_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_CT_PARMS]);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+
+ if (!err) {
+ err = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_ct_ops, bind, flags);
+ if (err) {
+ tcf_idr_cleanup(tn, index);
+ return err;
+ }
+ res = ACT_P_CREATED;
+ } else {
+ if (bind)
+ return ACT_P_BOUND;
+
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ }
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto cleanup;
+
+ c = to_ct(*a);
+
+ params = kzalloc(sizeof(*params), GFP_KERNEL);
+ if (unlikely(!params)) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+
+ err = tcf_ct_fill_params(net, params, parm, tb, extack);
+ if (err)
+ goto cleanup;
+
+ err = tcf_ct_flow_table_get(net, params);
+ if (err)
+ goto cleanup;
+
+ params->action = parm->action;
+ spin_lock_bh(&c->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ params = rcu_replace_pointer(c->params, params,
+ lockdep_is_held(&c->tcf_lock));
+ spin_unlock_bh(&c->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (params)
+ call_rcu(&params->rcu, tcf_ct_params_free_rcu);
+
+ return res;
+
+cleanup:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (params)
+ tcf_ct_params_free(params);
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_ct_cleanup(struct tc_action *a)
+{
+ struct tcf_ct_params *params;
+ struct tcf_ct *c = to_ct(a);
+
+ params = rcu_dereference_protected(c->params, 1);
+ if (params)
+ call_rcu(&params->rcu, tcf_ct_params_free_rcu);
+}
+
+static int tcf_ct_dump_key_val(struct sk_buff *skb,
+ const void *val, int val_type,
+ const void *mask, int mask_type,
+ int len)
+{
+ int err;
+
+ if (mask && !memchr_inv(mask, 0, len))
+ return 0;
+
+ err = nla_put(skb, val_type, len, val);
+ if (err)
+ return err;
+
+ if (mask_type != TCA_CT_UNSPEC) {
+ err = nla_put(skb, mask_type, len, mask);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p)
+{
+ const struct nf_nat_range2 *range = &p->range;
+
+ if (!(p->ct_action & TCA_CT_ACT_NAT))
+ return 0;
+
+ if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+ return 0;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS) {
+ if (p->ipv4_range) {
+ if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
+ range->min_addr.ip))
+ return -1;
+ if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
+ range->max_addr.ip))
+ return -1;
+ } else {
+ if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
+ &range->min_addr.in6))
+ return -1;
+ if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
+ &range->max_addr.in6))
+ return -1;
+ }
+ }
+
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
+ range->min_proto.all))
+ return -1;
+ if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
+ range->max_proto.all))
+ return -1;
+ }
+
+ return 0;
+}
+
+static int tcf_ct_dump_helper(struct sk_buff *skb,
+ const struct nf_conntrack_helper *helper)
+{
+ if (!helper)
+ return 0;
+
+ if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) ||
+ nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) ||
+ nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum))
+ return -1;
+
+ return 0;
+}
+
+static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ const struct tcf_ct *c = to_ct(a);
+ const struct tcf_ct_params *p;
+ struct tc_ct opt = {
+ .index = c->tcf_index,
+ .refcnt = refcount_read(&c->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ rcu_read_lock();
+ p = rcu_dereference(c->params);
+ opt.action = p->action;
+
+ if (tcf_ct_dump_key_val(skb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action)))
+ goto nla_put_failure;
+
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ goto skip_dump;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ tcf_ct_dump_key_val(skb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ tcf_ct_dump_key_val(skb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ tcf_ct_dump_key_val(skb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone)))
+ goto nla_put_failure;
+
+ if (tcf_ct_dump_nat(skb, p))
+ goto nla_put_failure;
+
+ if (tcf_ct_dump_helper(skb, p->helper))
+ goto nla_put_failure;
+
+skip_dump:
+ if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &c->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
+ goto nla_put_failure;
+ rcu_read_unlock();
+
+ return skb->len;
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
+{
+ struct tcf_ct *c = to_ct(a);
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
+}
+
+static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
+{
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ if (tcf_ct_helper(act))
+ return -EOPNOTSUPP;
+
+ entry->id = FLOW_ACTION_CT;
+ entry->ct.action = tcf_ct_action(act);
+ entry->ct.zone = tcf_ct_zone(act);
+ entry->ct.flow_table = tcf_ct_ft(act);
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ fl_action->id = FLOW_ACTION_CT;
+ }
+
+ return 0;
+}
+
+static struct tc_action_ops act_ct_ops = {
+ .kind = "ct",
+ .id = TCA_ID_CT,
+ .owner = THIS_MODULE,
+ .act = tcf_ct_act,
+ .dump = tcf_ct_dump,
+ .init = tcf_ct_init,
+ .cleanup = tcf_ct_cleanup,
+ .stats_update = tcf_stats_update,
+ .offload_act_setup = tcf_ct_offload_act_setup,
+ .size = sizeof(struct tcf_ct),
+};
+MODULE_ALIAS_NET_ACT("ct");
+
+static __net_init int ct_init_net(struct net *net)
+{
+ struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
+
+ return tc_action_net_init(net, &tn->tn, &act_ct_ops);
+}
+
+static void __net_exit ct_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, act_ct_ops.net_id);
+}
+
+static struct pernet_operations ct_net_ops = {
+ .init = ct_init_net,
+ .exit_batch = ct_exit_net,
+ .id = &act_ct_ops.net_id,
+ .size = sizeof(struct tc_ct_action_net),
+};
+
+static int __init ct_init_module(void)
+{
+ int err;
+
+ act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
+ if (!act_ct_wq)
+ return -ENOMEM;
+
+ err = tcf_ct_flow_tables_init();
+ if (err)
+ goto err_tbl_init;
+
+ err = tcf_register_action(&act_ct_ops, &ct_net_ops);
+ if (err)
+ goto err_register;
+
+ static_branch_inc(&tcf_frag_xmit_count);
+
+ return 0;
+
+err_register:
+ tcf_ct_flow_tables_uninit();
+err_tbl_init:
+ destroy_workqueue(act_ct_wq);
+ return err;
+}
+
+static void __exit ct_cleanup_module(void)
+{
+ static_branch_dec(&tcf_frag_xmit_count);
+ tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+ tcf_ct_flow_tables_uninit();
+ destroy_workqueue(act_ct_wq);
+}
+
+module_init(ct_init_module);
+module_exit(ct_cleanup_module);
+MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
+MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
+MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
+MODULE_DESCRIPTION("Connection tracking action");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
new file mode 100644
index 000000000000..71efe04d00b5
--- /dev/null
+++ b/net/sched/act_ctinfo.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <uapi/linux/tc_act/tc_ctinfo.h>
+#include <net/tc_act/tc_ctinfo.h>
+#include <net/tc_wrapper.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static struct tc_action_ops act_ctinfo_ops;
+
+static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb, int wlen, int proto)
+{
+ u8 dscp, newdscp;
+
+ newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
+ ~INET_ECN_MASK;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv4_change_dsfield(ip_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ atomic64_inc(&ca->stats_dscp_set);
+ } else {
+ atomic64_inc(&ca->stats_dscp_error);
+ }
+ }
+ break;
+ case NFPROTO_IPV6:
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv6_change_dsfield(ipv6_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ atomic64_inc(&ca->stats_dscp_set);
+ } else {
+ atomic64_inc(&ca->stats_dscp_error);
+ }
+ }
+ break;
+ default:
+ break;
+ }
+}
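A worked example of the DSCP restore above, assuming an illustrative dscpmask of 0x00fc0000 (so dscpmaskshift is 18) and a conntrack mark of 0x00b80000 that carries DSCP 46 (EF):

	/* newdscp = (((0x00b80000 & 0x00fc0000) >> 18) << 2) & ~INET_ECN_MASK
	 *         = (46 << 2) & 0xfc
	 *         = 0xb8
	 *
	 * ipv{4,6}_change_dsfield() is then called with mask INET_ECN_MASK,
	 * so only the six DSCP bits change and the ECN bits stay as received.
	 */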
+
+static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb)
+{
+ atomic64_inc(&ca->stats_cpmark_set);
+ skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask;
+}
+
+TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash = NULL;
+ struct tcf_ctinfo *ca = to_ctinfo(a);
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone zone;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ctinfo_params *cp;
+ struct nf_conn *ct;
+ int proto, wlen;
+
+ cp = rcu_dereference_bh(ca->params);
+
+ tcf_lastuse_update(&ca->tcf_tm);
+ tcf_action_update_bstats(&ca->common, skb);
+
+ wlen = skb_network_offset(skb);
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ break;
+ default:
+ goto out;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) { /* look harder, usually ingress */
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, cp->net, &tuple))
+ goto out;
+ zone.id = cp->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
+ if (!thash)
+ goto out;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+ }
+
+ if (cp->mode & CTINFO_MODE_DSCP)
+ if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask))
+ tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
+
+ if (cp->mode & CTINFO_MODE_CPMARK)
+ tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
+
+ if (thash)
+ nf_ct_put(ct);
+out:
+ return cp->action;
+}
+
+static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
+ [TCA_CTINFO_ACT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)),
+ [TCA_CTINFO_ZONE] = { .type = NLA_U16 },
+ [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 },
+};
+
+static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
+ u32 dscpmask = 0, dscpstatemask, index;
+ struct nlattr *tb[TCA_CTINFO_MAX + 1];
+ struct tcf_ctinfo_params *cp_new;
+ struct tcf_chain *goto_ch = NULL;
+ struct tc_ctinfo *actparm;
+ struct tcf_ctinfo *ci;
+ u8 dscpmaskshift;
+ int ret = 0, err;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CTINFO_ACT]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing required TCA_CTINFO_ACT attribute");
+ return -EINVAL;
+ }
+ actparm = nla_data(tb[TCA_CTINFO_ACT]);
+
+	/* Do some basic validation here before dynamically allocating
+	 * things that we would otherwise have to clean up.
+	 */
+ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
+ dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
+		/* need a contiguous 6-bit mask */
+ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
+ if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_MASK],
+ "dscp mask must be 6 contiguous bits");
+ return -EINVAL;
+ }
+ dscpstatemask =
+ nla_get_u32_default(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK],
+ 0);
+ /* mask & statemask must not overlap */
+ if (dscpmask & dscpstatemask) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_STATEMASK],
+ "dscp statemask must not overlap dscp mask");
+ return -EINVAL;
+ }
+ }
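	/* Worked example of the contiguity check above (illustrative values,
	 * not part of the patch): dscpmask 0x00fc0000 has __ffs() == 18 and
	 * 0x00fc0000 >> 18 == 0x3f, so it is accepted; dscpmask 0x00f30000
	 * has __ffs() == 16 and 0x00f30000 >> 16 == 0xf3 != 0x3f, so it is
	 * rejected with "dscp mask must be 6 contiguous bits".
	 */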
+
+	/* done with the validation: now to the actual action allocation */
+ index = actparm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_ctinfo_ops, bind, flags);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind) /* don't override defaults */
+ return ACT_P_BOUND;
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+
+ err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ ci = to_ctinfo(*a);
+
+ cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
+ if (unlikely(!cp_new)) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ cp_new->net = net;
+ cp_new->zone = nla_get_u16_default(tb[TCA_CTINFO_ZONE], 0);
+ if (dscpmask) {
+ cp_new->dscpmask = dscpmask;
+ cp_new->dscpmaskshift = dscpmaskshift;
+ cp_new->dscpstatemask = dscpstatemask;
+ cp_new->mode |= CTINFO_MODE_DSCP;
+ }
+
+ if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
+ cp_new->cpmarkmask =
+ nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
+ cp_new->mode |= CTINFO_MODE_CPMARK;
+ }
+
+ cp_new->action = actparm->action;
+
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
+ cp_new = rcu_replace_pointer(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
+ spin_unlock_bh(&ci->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (cp_new)
+ kfree_rcu(cp_new, rcu);
+
+ return ret;
+
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ const struct tcf_ctinfo *ci = to_ctinfo(a);
+ unsigned char *b = skb_tail_pointer(skb);
+ const struct tcf_ctinfo_params *cp;
+ struct tc_ctinfo opt = {
+ .index = ci->tcf_index,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ rcu_read_lock();
+ cp = rcu_dereference(ci->params);
+
+ tcf_tm_dump(&t, &ci->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ opt.action = cp->action;
+ if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
+ goto nla_put_failure;
+
+ if (cp->mode & CTINFO_MODE_DSCP) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
+ cp->dscpmask))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
+ cp->dscpstatemask))
+ goto nla_put_failure;
+ }
+
+ if (cp->mode & CTINFO_MODE_CPMARK) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
+ cp->cpmarkmask))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
+ atomic64_read(&ci->stats_dscp_set),
+ TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
+ atomic64_read(&ci->stats_dscp_error),
+ TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
+ atomic64_read(&ci->stats_cpmark_set),
+ TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ rcu_read_unlock();
+ return skb->len;
+
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static void tcf_ctinfo_cleanup(struct tc_action *a)
+{
+ struct tcf_ctinfo *ci = to_ctinfo(a);
+ struct tcf_ctinfo_params *cp;
+
+ cp = rcu_dereference_protected(ci->params, 1);
+ if (cp)
+ kfree_rcu(cp, rcu);
+}
+
+static struct tc_action_ops act_ctinfo_ops = {
+ .kind = "ctinfo",
+ .id = TCA_ID_CTINFO,
+ .owner = THIS_MODULE,
+ .act = tcf_ctinfo_act,
+ .dump = tcf_ctinfo_dump,
+ .init = tcf_ctinfo_init,
+	.cleanup = tcf_ctinfo_cleanup,
+ .size = sizeof(struct tcf_ctinfo),
+};
+MODULE_ALIAS_NET_ACT("ctinfo");
+
+static __net_init int ctinfo_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id);
+
+ return tc_action_net_init(net, tn, &act_ctinfo_ops);
+}
+
+static void __net_exit ctinfo_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, act_ctinfo_ops.net_id);
+}
+
+static struct pernet_operations ctinfo_net_ops = {
+ .init = ctinfo_init_net,
+ .exit_batch = ctinfo_exit_net,
+ .id = &act_ctinfo_ops.net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init ctinfo_init_module(void)
+{
+ return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+static void __exit ctinfo_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+module_init(ctinfo_init_module);
+module_exit(ctinfo_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
+MODULE_DESCRIPTION("Connection tracking mark actions");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index b61c20ebb314..e949280eb800 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_gact.c Generic actions
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2002-4)
- *
*/
#include <linux/types.h>
@@ -20,17 +15,18 @@
#include <linux/init.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_gact.h>
#include <net/tc_act/tc_gact.h>
+#include <net/tc_wrapper.h>
-static unsigned int gact_net_id;
static struct tc_action_ops act_gact_ops;
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
{
smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */
- if (prandom_u32() % gact->tcfg_pval)
+ if (get_random_u32_below(gact->tcfg_pval))
return gact->tcf_action;
return gact->tcfg_paction;
}
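A brief note on the probabilistic branch above (my reading of the hunk, values illustrative): get_random_u32_below(p) returns 0 with probability 1/p, so gact_net_rand() picks the alternate action roughly once every p packets.

	/* e.g. tcfg_pval = 10: get_random_u32_below(10) == 0 for ~10% of
	 * calls, so tcfg_paction is returned for about 1 packet in 10 and
	 * tcf_action for the rest.
	 */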
@@ -56,14 +52,17 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, gact_net_id);
+ struct tc_action_net *tn = net_generic(net, act_gact_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_GACT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_gact *parm;
struct tcf_gact *gact;
int ret = 0;
+ u32 index;
int err;
#ifdef CONFIG_GACT_PROB
struct tc_gact_p *p_parm = NULL;
@@ -72,13 +71,15 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_GACT_MAX, nla, gact_policy,
+ NULL);
if (err < 0)
return err;
if (tb[TCA_GACT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_GACT_PARMS]);
+ index = parm->index;
#ifndef CONFIG_GACT_PROB
if (tb[TCA_GACT_PROB] != NULL)
@@ -96,19 +97,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
}
#endif
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_gact_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_gact_ops, bind, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
} else if (err > 0) {
 		if (bind)/* don't override defaults */
- return 0;
- if (!ovr) {
+ return ACT_P_BOUND;
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
@@ -116,10 +117,13 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
return err;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
gact = to_gact(*a);
spin_lock_bh(&gact->tcf_lock);
- gact->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
#ifdef CONFIG_GACT_PROB
if (p_parm) {
gact->tcfg_paction = p_parm->paction;
@@ -133,13 +137,18 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
#endif
spin_unlock_bh(&gact->tcf_lock);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
return ret;
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
-static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_gact_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
@@ -152,31 +161,24 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
action = gact_rand[ptype](gact);
}
#endif
- bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&gact->common, skb);
if (action == TC_ACT_SHOT)
- qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&gact->common);
tcf_lastuse_update(&gact->tcf_tm);
return action;
}
-static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse, bool hw)
+static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
{
struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
- packets);
- if (action == TC_ACT_SHOT)
- this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
-
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
- bytes, packets);
-
+ tcf_action_update_stats(a, bytes, packets,
+ action == TC_ACT_SHOT ? packets : drops, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -221,23 +223,6 @@ nla_put_failure:
return -1;
}
-static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, gact_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, gact_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
static size_t tcf_gact_get_fill_size(const struct tc_action *act)
{
size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */
@@ -251,36 +236,84 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act)
return sz;
}
+static int tcf_gact_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
+{
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ if (is_tcf_gact_ok(act)) {
+ entry->id = FLOW_ACTION_ACCEPT;
+ } else if (is_tcf_gact_shot(act)) {
+ entry->id = FLOW_ACTION_DROP;
+ } else if (is_tcf_gact_trap(act)) {
+ entry->id = FLOW_ACTION_TRAP;
+ } else if (is_tcf_gact_goto_chain(act)) {
+ entry->id = FLOW_ACTION_GOTO;
+ entry->chain_index = tcf_gact_goto_chain_index(act);
+ } else if (is_tcf_gact_continue(act)) {
+ NL_SET_ERR_MSG_MOD(extack, "Offload of \"continue\" action is not supported");
+ return -EOPNOTSUPP;
+ } else if (is_tcf_gact_reclassify(act)) {
+ NL_SET_ERR_MSG_MOD(extack, "Offload of \"reclassify\" action is not supported");
+ return -EOPNOTSUPP;
+ } else if (is_tcf_gact_pipe(act)) {
+ NL_SET_ERR_MSG_MOD(extack, "Offload of \"pipe\" action is not supported");
+ return -EOPNOTSUPP;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported generic action offload");
+ return -EOPNOTSUPP;
+ }
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ if (is_tcf_gact_ok(act))
+ fl_action->id = FLOW_ACTION_ACCEPT;
+ else if (is_tcf_gact_shot(act))
+ fl_action->id = FLOW_ACTION_DROP;
+ else if (is_tcf_gact_trap(act))
+ fl_action->id = FLOW_ACTION_TRAP;
+ else if (is_tcf_gact_goto_chain(act))
+ fl_action->id = FLOW_ACTION_GOTO;
+ else
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
static struct tc_action_ops act_gact_ops = {
.kind = "gact",
- .type = TCA_ACT_GACT,
+ .id = TCA_ID_GACT,
.owner = THIS_MODULE,
.act = tcf_gact_act,
.stats_update = tcf_gact_stats_update,
.dump = tcf_gact_dump,
.init = tcf_gact_init,
- .walk = tcf_gact_walker,
- .lookup = tcf_gact_search,
.get_fill_size = tcf_gact_get_fill_size,
+ .offload_act_setup = tcf_gact_offload_act_setup,
.size = sizeof(struct tcf_gact),
};
+MODULE_ALIAS_NET_ACT("gact");
static __net_init int gact_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, gact_net_id);
+ struct tc_action_net *tn = net_generic(net, act_gact_ops.net_id);
- return tc_action_net_init(tn, &act_gact_ops);
+ return tc_action_net_init(net, tn, &act_gact_ops);
}
static void __net_exit gact_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, gact_net_id);
+ tc_action_net_exit(net_list, act_gact_ops.net_id);
}
static struct pernet_operations gact_net_ops = {
.init = gact_init_net,
.exit_batch = gact_exit_net,
- .id = &gact_net_id,
+ .id = &act_gact_ops.net_id,
.size = sizeof(struct tc_action_net),
};
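
The probability hook in gact keeps its semantics across the RNG change: get_random_u32_below(pval) draws uniformly from [0, pval), so the draw is zero, and hence the alternate tcfg_paction fires, on average once every pval packets, while the common case still returns tcf_action; unlike prandom_u32() % pval, the new helper also avoids modulo bias. A small user-space model of that branch (illustrative only; rand() stands in for the kernel RNG and every name below is made up):

#include <stdio.h>
#include <stdlib.h>

/* stand-in for the kernel's bounded RNG; the real helper is unbiased */
static unsigned int random_below(unsigned int ceil)
{
	return (unsigned int)rand() % ceil;
}

int main(void)
{
	unsigned int pval = 4;		/* alternate action fires ~1 in 4 */
	unsigned long base = 0, alt = 0;

	for (int i = 0; i < 1000000; i++) {
		if (random_below(pval))
			base++;		/* gact->tcf_action, e.g. pass */
		else
			alt++;		/* gact->tcfg_paction, e.g. drop */
	}
	printf("base %lu alt %lu (expect roughly 3:1 for pval = 4)\n",
	       base, alt);
	return 0;
}
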
diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
new file mode 100644
index 000000000000..c1f75f272757
--- /dev/null
+++ b/net/sched/act_gate.c
@@ -0,0 +1,676 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2020 NXP */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_gate.h>
+#include <net/tc_wrapper.h>
+
+static struct tc_action_ops act_gate_ops;
+
+static ktime_t gate_get_time(struct tcf_gate *gact)
+{
+ ktime_t mono = ktime_get();
+
+ switch (gact->tk_offset) {
+ case TK_OFFS_MAX:
+ return mono;
+ default:
+ return ktime_mono_to_any(mono, gact->tk_offset);
+ }
+
+ return KTIME_MAX;
+}
+
+static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
+{
+ struct tcf_gate_params *param = &gact->param;
+ ktime_t now, base, cycle;
+ u64 n;
+
+ base = ns_to_ktime(param->tcfg_basetime);
+ now = gate_get_time(gact);
+
+ if (ktime_after(base, now)) {
+ *start = base;
+ return;
+ }
+
+ cycle = param->tcfg_cycletime;
+
+ n = div64_u64(ktime_sub_ns(now, base), cycle);
+ *start = ktime_add_ns(base, (n + 1) * cycle);
+}
+
+static void gate_start_timer(struct tcf_gate *gact, ktime_t start)
+{
+ ktime_t expires;
+
+ expires = hrtimer_get_expires(&gact->hitimer);
+ if (expires == 0)
+ expires = KTIME_MAX;
+
+ start = min_t(ktime_t, start, expires);
+
+ hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT);
+}
+
+static enum hrtimer_restart gate_timer_func(struct hrtimer *timer)
+{
+ struct tcf_gate *gact = container_of(timer, struct tcf_gate,
+ hitimer);
+ struct tcf_gate_params *p = &gact->param;
+ struct tcfg_gate_entry *next;
+ ktime_t close_time, now;
+
+ spin_lock(&gact->tcf_lock);
+
+ next = gact->next_entry;
+
+ /* cycle start, clear pending bit, clear total octets */
+ gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0;
+ gact->current_entry_octets = 0;
+ gact->current_max_octets = next->maxoctets;
+
+ gact->current_close_time = ktime_add_ns(gact->current_close_time,
+ next->interval);
+
+ close_time = gact->current_close_time;
+
+ if (list_is_last(&next->list, &p->entries))
+ next = list_first_entry(&p->entries,
+ struct tcfg_gate_entry, list);
+ else
+ next = list_next_entry(next, list);
+
+ now = gate_get_time(gact);
+
+ if (ktime_after(now, close_time)) {
+ ktime_t cycle, base;
+ u64 n;
+
+ cycle = p->tcfg_cycletime;
+ base = ns_to_ktime(p->tcfg_basetime);
+ n = div64_u64(ktime_sub_ns(now, base), cycle);
+ close_time = ktime_add_ns(base, (n + 1) * cycle);
+ }
+
+ gact->next_entry = next;
+
+ hrtimer_set_expires(&gact->hitimer, close_time);
+
+ spin_unlock(&gact->tcf_lock);
+
+ return HRTIMER_RESTART;
+}
+
+TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_gate *gact = to_gate(a);
+ int action = READ_ONCE(gact->tcf_action);
+
+ tcf_lastuse_update(&gact->tcf_tm);
+ tcf_action_update_bstats(&gact->common, skb);
+
+ spin_lock(&gact->tcf_lock);
+ if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) {
+ spin_unlock(&gact->tcf_lock);
+ return action;
+ }
+
+ if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) {
+ spin_unlock(&gact->tcf_lock);
+ goto drop;
+ }
+
+ if (gact->current_max_octets >= 0) {
+ gact->current_entry_octets += qdisc_pkt_len(skb);
+ if (gact->current_entry_octets > gact->current_max_octets) {
+ spin_unlock(&gact->tcf_lock);
+ goto overlimit;
+ }
+ }
+ spin_unlock(&gact->tcf_lock);
+
+ return action;
+
+overlimit:
+ tcf_action_inc_overlimit_qstats(&gact->common);
+drop:
+ tcf_action_inc_drop_qstats(&gact->common);
+ return TC_ACT_SHOT;
+}
+
+static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = {
+ [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 },
+ [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG },
+ [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 },
+ [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 },
+ [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 },
+};
+
+static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = {
+ [TCA_GATE_PARMS] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)),
+ [TCA_GATE_PRIORITY] = { .type = NLA_S32 },
+ [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED },
+ [TCA_GATE_BASE_TIME] = { .type = NLA_U64 },
+ [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 },
+ [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 },
+ [TCA_GATE_FLAGS] = { .type = NLA_U32 },
+ [TCA_GATE_CLOCKID] = { .type = NLA_S32 },
+};
+
+static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry,
+ struct netlink_ext_ack *extack)
+{
+ u32 interval = 0;
+
+ entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]);
+
+ if (tb[TCA_GATE_ENTRY_INTERVAL])
+ interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]);
+
+ if (interval == 0) {
+ NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
+ return -EINVAL;
+ }
+
+ entry->interval = interval;
+
+ entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1);
+
+ entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS],
+ -1);
+
+ return 0;
+}
+
+static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry,
+ int index, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { };
+ int err;
+
+ err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+ return -EINVAL;
+ }
+
+ entry->index = index;
+
+ return fill_gate_entry(tb, entry, extack);
+}
+
+static void release_entry_list(struct list_head *entries)
+{
+ struct tcfg_gate_entry *entry, *e;
+
+ list_for_each_entry_safe(entry, e, entries, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+}
+
+static int parse_gate_list(struct nlattr *list_attr,
+ struct tcf_gate_params *sched,
+ struct netlink_ext_ack *extack)
+{
+ struct tcfg_gate_entry *entry;
+ struct nlattr *n;
+ int err, rem;
+ int i = 0;
+
+ if (!list_attr)
+ return -EINVAL;
+
+ nla_for_each_nested(n, list_attr, rem) {
+ if (nla_type(n) != TCA_GATE_ONE_ENTRY) {
+ NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'");
+ continue;
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry) {
+ NL_SET_ERR_MSG(extack, "Not enough memory for entry");
+ err = -ENOMEM;
+ goto release_list;
+ }
+
+ err = parse_gate_entry(n, entry, i, extack);
+ if (err < 0) {
+ kfree(entry);
+ goto release_list;
+ }
+
+ list_add_tail(&entry->list, &sched->entries);
+ i++;
+ }
+
+ sched->num_entries = i;
+
+ return i;
+
+release_list:
+ release_entry_list(&sched->entries);
+
+ return err;
+}
+
+static void gate_setup_timer(struct tcf_gate *gact, u64 basetime,
+ enum tk_offsets tko, s32 clockid,
+ bool do_init)
+{
+ if (!do_init) {
+ if (basetime == gact->param.tcfg_basetime &&
+ tko == gact->tk_offset &&
+ clockid == gact->param.tcfg_clockid)
+ return;
+
+ spin_unlock_bh(&gact->tcf_lock);
+ hrtimer_cancel(&gact->hitimer);
+ spin_lock_bh(&gact->tcf_lock);
+ }
+ gact->param.tcfg_basetime = basetime;
+ gact->param.tcfg_clockid = clockid;
+ gact->tk_offset = tko;
+ hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT);
+}
+
+static int tcf_gate_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id);
+ enum tk_offsets tk_offset = TK_OFFS_TAI;
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
+ struct nlattr *tb[TCA_GATE_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ u64 cycletime = 0, basetime = 0;
+ struct tcf_gate_params *p;
+ s32 clockid = CLOCK_TAI;
+ struct tcf_gate *gact;
+ struct tc_gate *parm;
+ int ret = 0, err;
+ u32 gflags = 0;
+ s32 prio = -1;
+ ktime_t start;
+ u32 index;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_GATE_PARMS])
+ return -EINVAL;
+
+ if (tb[TCA_GATE_CLOCKID]) {
+ clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]);
+ switch (clockid) {
+ case CLOCK_REALTIME:
+ tk_offset = TK_OFFS_REAL;
+ break;
+ case CLOCK_MONOTONIC:
+ tk_offset = TK_OFFS_MAX;
+ break;
+ case CLOCK_BOOTTIME:
+ tk_offset = TK_OFFS_BOOT;
+ break;
+ case CLOCK_TAI:
+ tk_offset = TK_OFFS_TAI;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+ return -EINVAL;
+ }
+ }
+
+ parm = nla_data(tb[TCA_GATE_PARMS]);
+ index = parm->index;
+
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+
+ if (err && bind)
+ return ACT_P_BOUND;
+
+ if (!err) {
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_gate_ops, bind, flags);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ if (tb[TCA_GATE_PRIORITY])
+ prio = nla_get_s32(tb[TCA_GATE_PRIORITY]);
+
+ if (tb[TCA_GATE_BASE_TIME])
+ basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]);
+
+ if (tb[TCA_GATE_FLAGS])
+ gflags = nla_get_u32(tb[TCA_GATE_FLAGS]);
+
+ gact = to_gate(*a);
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&gact->param.entries);
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ spin_lock_bh(&gact->tcf_lock);
+ p = &gact->param;
+
+ if (tb[TCA_GATE_CYCLE_TIME])
+ cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
+
+ if (tb[TCA_GATE_ENTRY_LIST]) {
+ err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack);
+ if (err < 0)
+ goto chain_put;
+ }
+
+ if (!cycletime) {
+ struct tcfg_gate_entry *entry;
+ ktime_t cycle = 0;
+
+ list_for_each_entry(entry, &p->entries, list)
+ cycle = ktime_add_ns(cycle, entry->interval);
+ cycletime = cycle;
+ if (!cycletime) {
+ err = -EINVAL;
+ goto chain_put;
+ }
+ }
+ p->tcfg_cycletime = cycletime;
+
+ if (tb[TCA_GATE_CYCLE_TIME_EXT])
+ p->tcfg_cycletime_ext =
+ nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]);
+
+ gate_setup_timer(gact, basetime, tk_offset, clockid,
+ ret == ACT_P_CREATED);
+ p->tcfg_priority = prio;
+ p->tcfg_flags = gflags;
+ gate_get_start_time(gact, &start);
+
+ gact->current_close_time = start;
+ gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING;
+
+ gact->next_entry = list_first_entry(&p->entries,
+ struct tcfg_gate_entry, list);
+
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+
+ gate_start_timer(gact, start);
+
+ spin_unlock_bh(&gact->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+ return ret;
+
+chain_put:
+ spin_unlock_bh(&gact->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ /* action is not inserted in any list: it's safe to init hitimer
+ * without taking tcf_lock.
+ */
+ if (ret == ACT_P_CREATED)
+ gate_setup_timer(gact, gact->param.tcfg_basetime,
+ gact->tk_offset, gact->param.tcfg_clockid,
+ true);
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_gate_cleanup(struct tc_action *a)
+{
+ struct tcf_gate *gact = to_gate(a);
+ struct tcf_gate_params *p;
+
+ p = &gact->param;
+ hrtimer_cancel(&gact->hitimer);
+ release_entry_list(&p->entries);
+}
+
+static int dumping_entry(struct sk_buff *skb,
+ struct tcfg_gate_entry *entry)
+{
+ struct nlattr *item;
+
+ item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY);
+ if (!item)
+ return -ENOSPC;
+
+ if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index))
+ goto nla_put_failure;
+
+ if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, item);
+
+nla_put_failure:
+ nla_nest_cancel(skb, item);
+ return -1;
+}
+
+static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_gate *gact = to_gate(a);
+ struct tc_gate opt = {
+ .index = gact->tcf_index,
+ .refcnt = refcount_read(&gact->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
+ };
+ struct tcfg_gate_entry *entry;
+ struct tcf_gate_params *p;
+ struct nlattr *entry_list;
+ struct tcf_t t;
+
+ spin_lock_bh(&gact->tcf_lock);
+ opt.action = gact->tcf_action;
+
+ p = &gact->param;
+
+ if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME,
+ p->tcfg_basetime, TCA_GATE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME,
+ p->tcfg_cycletime, TCA_GATE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT,
+ p->tcfg_cycletime_ext, TCA_GATE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority))
+ goto nla_put_failure;
+
+ entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST);
+ if (!entry_list)
+ goto nla_put_failure;
+
+ list_for_each_entry(entry, &p->entries, list) {
+ if (dumping_entry(skb, entry) < 0)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, entry_list);
+
+ tcf_tm_dump(&t, &gact->tcf_tm);
+ if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&gact->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&gact->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
+{
+ struct tcf_gate *gact = to_gate(a);
+ struct tcf_t *tm = &gact->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
+static size_t tcf_gate_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_gate));
+}
+
+static void tcf_gate_entry_destructor(void *priv)
+{
+ struct action_gate_entry *oe = priv;
+
+ kfree(oe);
+}
+
+static int tcf_gate_get_entries(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ entry->gate.entries = tcf_gate_get_list(act);
+
+ if (!entry->gate.entries)
+ return -EINVAL;
+
+ entry->destructor = tcf_gate_entry_destructor;
+ entry->destructor_priv = entry->gate.entries;
+
+ return 0;
+}
+
+static int tcf_gate_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ entry->id = FLOW_ACTION_GATE;
+ entry->gate.prio = tcf_gate_prio(act);
+ entry->gate.basetime = tcf_gate_basetime(act);
+ entry->gate.cycletime = tcf_gate_cycletime(act);
+ entry->gate.cycletimeext = tcf_gate_cycletimeext(act);
+ entry->gate.num_entries = tcf_gate_num_entries(act);
+ err = tcf_gate_get_entries(entry, act);
+ if (err)
+ return err;
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ fl_action->id = FLOW_ACTION_GATE;
+ }
+
+ return 0;
+}
+
+static struct tc_action_ops act_gate_ops = {
+ .kind = "gate",
+ .id = TCA_ID_GATE,
+ .owner = THIS_MODULE,
+ .act = tcf_gate_act,
+ .dump = tcf_gate_dump,
+ .init = tcf_gate_init,
+ .cleanup = tcf_gate_cleanup,
+ .stats_update = tcf_gate_stats_update,
+ .get_fill_size = tcf_gate_get_fill_size,
+ .offload_act_setup = tcf_gate_offload_act_setup,
+ .size = sizeof(struct tcf_gate),
+};
+MODULE_ALIAS_NET_ACT("gate");
+
+static __net_init int gate_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id);
+
+ return tc_action_net_init(net, tn, &act_gate_ops);
+}
+
+static void __net_exit gate_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, act_gate_ops.net_id);
+}
+
+static struct pernet_operations gate_net_ops = {
+ .init = gate_init_net,
+ .exit_batch = gate_exit_net,
+ .id = &act_gate_ops.net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init gate_init_module(void)
+{
+ return tcf_register_action(&act_gate_ops, &gate_net_ops);
+}
+
+static void __exit gate_cleanup_module(void)
+{
+ tcf_unregister_action(&act_gate_ops, &gate_net_ops);
+}
+
+module_init(gate_init_module);
+module_exit(gate_cleanup_module);
+MODULE_DESCRIPTION("TC gate action");
+MODULE_LICENSE("GPL v2");
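
The start-time computation in gate_get_start_time() above is plain integer arithmetic: when the configured base time already lies in the past, the schedule begins at the next full cycle boundary after "now", that is base + (n + 1) * cycle with n = (now - base) / cycle. A stand-alone sketch of that calculation (user-space model under assumed example values, not the kernel function itself):

#include <stdio.h>
#include <stdint.h>

static uint64_t next_cycle_start(uint64_t base_ns, uint64_t cycle_ns,
				 uint64_t now_ns)
{
	uint64_t n;

	if (base_ns > now_ns)
		return base_ns;		/* base still in the future: start there */

	n = (now_ns - base_ns) / cycle_ns;
	return base_ns + (n + 1) * cycle_ns;
}

int main(void)
{
	/* base 1.0 s, cycle 200 ms, now 1.35 s -> next start at 1.4 s */
	uint64_t start = next_cycle_start(1000000000ULL, 200000000ULL,
					  1350000000ULL);

	printf("next cycle starts at %llu ns\n", (unsigned long long)start);
	return 0;
}
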
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 30b63fa23ee2..1dfdda6c2d4c 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB
*
@@ -9,13 +10,7 @@
* Subsystem"
* Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2015)
- *
*/
#include <linux/types.h>
@@ -29,12 +24,13 @@
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <uapi/linux/tc_act/tc_ife.h>
#include <net/tc_act/tc_ife.h>
#include <linux/etherdevice.h>
#include <net/ife.h>
+#include <net/tc_wrapper.h>
-static unsigned int ife_net_id;
static int max_metacnt = IFE_META_MAX + 1;
static struct tc_action_ops act_ife_ops;
@@ -386,7 +382,7 @@ static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
if (list_empty(&ife->metalist))
return 0;
- nest = nla_nest_start(skb, TCA_IFE_METALST);
+ nest = nla_nest_start_noflag(skb, TCA_IFE_METALST);
if (!nest)
goto out_nlmsg_trim;
@@ -440,6 +436,25 @@ static void tcf_ife_cleanup(struct tc_action *a)
kfree_rcu(p, rcu);
}
+static int load_metalist(struct nlattr **tb, bool rtnl_held)
+{
+ int i;
+
+ for (i = 1; i < max_metacnt; i++) {
+ if (tb[i]) {
+ void *val = nla_data(tb[i]);
+ int len = nla_len(tb[i]);
+ int rc;
+
+ rc = load_metaops_and_vet(i, val, len, rtnl_held);
+ if (rc != 0)
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
bool exists, bool rtnl_held)
{
@@ -453,10 +468,6 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
val = nla_data(tb[i]);
len = nla_len(tb[i]);
- rc = load_metaops_and_vet(i, val, len, rtnl_held);
- if (rc != 0)
- return rc;
-
rc = add_metainfo(ife, i, val, len, exists);
if (rc)
return rc;
@@ -468,12 +479,14 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, ife_net_id);
+ struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_IFE_MAX + 1];
struct nlattr *tb2[IFE_META_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tcf_ife_params *p;
struct tcf_ife_info *ife;
u16 ife_type = ETH_P_IFE;
@@ -482,9 +495,16 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
u8 *saddr = NULL;
bool exists = false;
int ret = 0;
+ u32 index;
int err;
- err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy, NULL);
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy,
+ NULL);
if (err < 0)
return err;
@@ -504,7 +524,23 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (!p)
return -ENOMEM;
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (tb[TCA_IFE_METALST]) {
+ err = nla_parse_nested_deprecated(tb2, IFE_META_MAX,
+ tb[TCA_IFE_METALST], NULL,
+ NULL);
+ if (err) {
+ kfree(p);
+ return err;
+ }
+ err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL));
+ if (err) {
+ kfree(p);
+ return err;
+ }
+ }
+
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0) {
kfree(p);
return err;
@@ -512,25 +548,32 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
exists = err;
if (exists && bind) {
kfree(p);
- return 0;
+ return ACT_P_BOUND;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops,
- bind, true);
+ ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
+ bind, true, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
kfree(p);
return ret;
}
ret = ACT_P_CREATED;
- } else if (!ovr) {
+ } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
kfree(p);
return -EEXIST;
}
ife = to_ife(*a);
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&ife->metalist);
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
p->flags = parm->flags;
if (parm->flags & IFE_ENCODE) {
@@ -556,24 +599,11 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
p->eth_type = ife_type;
}
-
- if (ret == ACT_P_CREATED)
- INIT_LIST_HEAD(&ife->metalist);
-
if (tb[TCA_IFE_METALST]) {
- err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
- NULL, NULL);
- if (err) {
-metadata_parse_err:
- tcf_idr_release(*a, bind);
- kfree(p);
- return err;
- }
-
- err = populate_metalist(ife, tb2, exists, rtnl_held);
+ err = populate_metalist(ife, tb2, exists,
+ !(flags & TCA_ACT_FLAGS_NO_RTNL));
if (err)
goto metadata_parse_err;
-
} else {
/* if no passed metadata allow list or passed allow-all
* then here we process by adding as many supported metadatum
@@ -581,28 +611,31 @@ metadata_parse_err:
* going to bail out
*/
err = use_all_metadata(ife, exists);
- if (err) {
- tcf_idr_release(*a, bind);
- kfree(p);
- return err;
- }
+ if (err)
+ goto metadata_parse_err;
}
if (exists)
spin_lock_bh(&ife->tcf_lock);
- ife->tcf_action = parm->action;
/* protected by tcf_lock when modifying existing action */
- rcu_swap_protected(ife->params, p, 1);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ p = rcu_replace_pointer(ife->params, p, 1);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (p)
kfree_rcu(p, rcu);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
-
return ret;
+metadata_parse_err:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ kfree(p);
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
@@ -611,13 +644,15 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
unsigned char *b = skb_tail_pointer(skb);
struct tcf_ife_info *ife = to_ife(a);
struct tcf_ife_params *p;
- struct tc_ife opt = {
- .index = ife->tcf_index,
- .refcnt = refcount_read(&ife->tcf_refcnt) - ref,
- .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
- };
+ struct tc_ife opt;
struct tcf_t t;
+ memset(&opt, 0, sizeof(opt));
+
+ opt.index = ife->tcf_index;
+ opt.refcnt = refcount_read(&ife->tcf_refcnt) - ref;
+ opt.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind;
+
spin_lock_bh(&ife->tcf_lock);
opt.action = ife->tcf_action;
p = rcu_dereference_protected(ife->params,
@@ -685,7 +720,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
u8 *tlv_data;
u16 metalen;
- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
+ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
tcf_lastuse_update(&ife->tcf_tm);
if (skb_at_tc_ingress(skb))
@@ -773,7 +808,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
exceed_mtu = true;
}
- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
+ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
tcf_lastuse_update(&ife->tcf_tm);
if (!metalen) { /* no metadata to send */
@@ -829,8 +864,9 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
return action;
}
-static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_ife_info *ife = to_ife(a);
struct tcf_ife_params *p;
@@ -845,52 +881,34 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
return tcf_ife_decode(skb, a, res);
}
-static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, ife_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, ife_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
static struct tc_action_ops act_ife_ops = {
.kind = "ife",
- .type = TCA_ACT_IFE,
+ .id = TCA_ID_IFE,
.owner = THIS_MODULE,
.act = tcf_ife_act,
.dump = tcf_ife_dump,
.cleanup = tcf_ife_cleanup,
.init = tcf_ife_init,
- .walk = tcf_ife_walker,
- .lookup = tcf_ife_search,
.size = sizeof(struct tcf_ife_info),
};
+MODULE_ALIAS_NET_ACT("ife");
static __net_init int ife_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, ife_net_id);
+ struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id);
- return tc_action_net_init(tn, &act_ife_ops);
+ return tc_action_net_init(net, tn, &act_ife_ops);
}
static void __net_exit ife_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, ife_net_id);
+ tc_action_net_exit(net_list, act_ife_ops.net_id);
}
static struct pernet_operations ife_net_ops = {
.init = ife_init_net,
.exit_batch = ife_exit_net,
- .id = &ife_net_id,
+ .id = &act_ife_ops.net_id,
.size = sizeof(struct tc_action_net),
};
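
The ife rewrite publishes its new parameter block with the usual RCU replace-and-free sequence instead of assigning fields in place: build the block outside any lock, swap it in with rcu_replace_pointer() (under tcf_lock when an existing action is being modified), and free the old block after a grace period. The fragment below sketches that sequence; it assumes the surrounding init context (ife, exists, the netlink parsing) and uses new_p/old_p as illustrative names.

	struct tcf_ife_params *new_p, *old_p;

	new_p = kzalloc(sizeof(*new_p), GFP_KERNEL);
	if (!new_p)
		return -ENOMEM;
	/* ... fill in new_p from the parsed attributes ... */

	if (exists)
		spin_lock_bh(&ife->tcf_lock);
	/* third argument is the lockdep condition; exclusion is guaranteed here */
	old_p = rcu_replace_pointer(ife->params, new_p, 1);
	if (exists)
		spin_unlock_bh(&ife->tcf_lock);

	if (old_p)
		kfree_rcu(old_p, rcu);	/* readers in tcf_ife_act() may still use old_p */
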
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
deleted file mode 100644
index 8af6c11d2482..000000000000
--- a/net/sched/act_ipt.c
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * net/sched/act_ipt.c iptables target interface
- *
- *TODO: Add other tables. For now we only support the ipv4 table targets
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Copyright: Jamal Hadi Salim (2002-13)
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <net/netlink.h>
-#include <net/pkt_sched.h>
-#include <linux/tc_act/tc_ipt.h>
-#include <net/tc_act/tc_ipt.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-
-static unsigned int ipt_net_id;
-static struct tc_action_ops act_ipt_ops;
-
-static unsigned int xt_net_id;
-static struct tc_action_ops act_xt_ops;
-
-static int ipt_init_target(struct net *net, struct xt_entry_target *t,
- char *table, unsigned int hook)
-{
- struct xt_tgchk_param par;
- struct xt_target *target;
- struct ipt_entry e = {};
- int ret = 0;
-
- target = xt_request_find_target(AF_INET, t->u.user.name,
- t->u.user.revision);
- if (IS_ERR(target))
- return PTR_ERR(target);
-
- t->u.kernel.target = target;
- memset(&par, 0, sizeof(par));
- par.net = net;
- par.table = table;
- par.entryinfo = &e;
- par.target = target;
- par.targinfo = t->data;
- par.hook_mask = hook;
- par.family = NFPROTO_IPV4;
-
- ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
- if (ret < 0) {
- module_put(t->u.kernel.target->me);
- return ret;
- }
- return 0;
-}
-
-static void ipt_destroy_target(struct xt_entry_target *t)
-{
- struct xt_tgdtor_param par = {
- .target = t->u.kernel.target,
- .targinfo = t->data,
- .family = NFPROTO_IPV4,
- };
- if (par.target->destroy != NULL)
- par.target->destroy(&par);
- module_put(par.target->me);
-}
-
-static void tcf_ipt_release(struct tc_action *a)
-{
- struct tcf_ipt *ipt = to_ipt(a);
-
- if (ipt->tcfi_t) {
- ipt_destroy_target(ipt->tcfi_t);
- kfree(ipt->tcfi_t);
- }
- kfree(ipt->tcfi_tname);
-}
-
-static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
- [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
- [TCA_IPT_HOOK] = { .type = NLA_U32 },
- [TCA_IPT_INDEX] = { .type = NLA_U32 },
- [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
-};
-
-static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a,
- const struct tc_action_ops *ops, int ovr, int bind)
-{
- struct tc_action_net *tn = net_generic(net, id);
- struct nlattr *tb[TCA_IPT_MAX + 1];
- struct tcf_ipt *ipt;
- struct xt_entry_target *td, *t;
- char *tname;
- bool exists = false;
- int ret = 0, err;
- u32 hook = 0;
- u32 index = 0;
-
- if (nla == NULL)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy, NULL);
- if (err < 0)
- return err;
-
- if (tb[TCA_IPT_INDEX] != NULL)
- index = nla_get_u32(tb[TCA_IPT_INDEX]);
-
- err = tcf_idr_check_alloc(tn, &index, a, bind);
- if (err < 0)
- return err;
- exists = err;
- if (exists && bind)
- return 0;
-
- if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
- if (exists)
- tcf_idr_release(*a, bind);
- else
- tcf_idr_cleanup(tn, index);
- return -EINVAL;
- }
-
- td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
- if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) {
- if (exists)
- tcf_idr_release(*a, bind);
- else
- tcf_idr_cleanup(tn, index);
- return -EINVAL;
- }
-
- if (!exists) {
- ret = tcf_idr_create(tn, index, est, a, ops, bind,
- false);
- if (ret) {
- tcf_idr_cleanup(tn, index);
- return ret;
- }
- ret = ACT_P_CREATED;
- } else {
- if (bind)/* dont override defaults */
- return 0;
-
- if (!ovr) {
- tcf_idr_release(*a, bind);
- return -EEXIST;
- }
- }
- hook = nla_get_u32(tb[TCA_IPT_HOOK]);
-
- err = -ENOMEM;
- tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
- if (unlikely(!tname))
- goto err1;
- if (tb[TCA_IPT_TABLE] == NULL ||
- nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
- strcpy(tname, "mangle");
-
- t = kmemdup(td, td->u.target_size, GFP_KERNEL);
- if (unlikely(!t))
- goto err2;
-
- err = ipt_init_target(net, t, tname, hook);
- if (err < 0)
- goto err3;
-
- ipt = to_ipt(*a);
-
- spin_lock_bh(&ipt->tcf_lock);
- if (ret != ACT_P_CREATED) {
- ipt_destroy_target(ipt->tcfi_t);
- kfree(ipt->tcfi_tname);
- kfree(ipt->tcfi_t);
- }
- ipt->tcfi_tname = tname;
- ipt->tcfi_t = t;
- ipt->tcfi_hook = hook;
- spin_unlock_bh(&ipt->tcf_lock);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
- return ret;
-
-err3:
- kfree(t);
-err2:
- kfree(tname);
-err1:
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
- return err;
-}
-
-static int tcf_ipt_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
-{
- return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
- bind);
-}
-
-static int tcf_xt_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool unlocked,
- struct netlink_ext_ack *extack)
-{
- return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
- bind);
-}
-
-static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
-{
- int ret = 0, result = 0;
- struct tcf_ipt *ipt = to_ipt(a);
- struct xt_action_param par;
- struct nf_hook_state state = {
- .net = dev_net(skb->dev),
- .in = skb->dev,
- .hook = ipt->tcfi_hook,
- .pf = NFPROTO_IPV4,
- };
-
- if (skb_unclone(skb, GFP_ATOMIC))
- return TC_ACT_UNSPEC;
-
- spin_lock(&ipt->tcf_lock);
-
- tcf_lastuse_update(&ipt->tcf_tm);
- bstats_update(&ipt->tcf_bstats, skb);
-
- /* yes, we have to worry about both in and out dev
- * worry later - danger - this API seems to have changed
- * from earlier kernels
- */
- par.state = &state;
- par.target = ipt->tcfi_t->u.kernel.target;
- par.targinfo = ipt->tcfi_t->data;
- ret = par.target->target(skb, &par);
-
- switch (ret) {
- case NF_ACCEPT:
- result = TC_ACT_OK;
- break;
- case NF_DROP:
- result = TC_ACT_SHOT;
- ipt->tcf_qstats.drops++;
- break;
- case XT_CONTINUE:
- result = TC_ACT_PIPE;
- break;
- default:
- net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
- ret);
- result = TC_ACT_OK;
- break;
- }
- spin_unlock(&ipt->tcf_lock);
- return result;
-
-}
-
-static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
- int ref)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tcf_ipt *ipt = to_ipt(a);
- struct xt_entry_target *t;
- struct tcf_t tm;
- struct tc_cnt c;
-
- /* for simple targets kernel size == user size
- * user name = target name
- * for foolproof you need to not assume this
- */
-
- spin_lock_bh(&ipt->tcf_lock);
- t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
- if (unlikely(!t))
- goto nla_put_failure;
-
- c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind;
- c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref;
- strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
-
- if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
- nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) ||
- nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) ||
- nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
- nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
- goto nla_put_failure;
-
- tcf_tm_dump(&tm, &ipt->tcf_tm);
- if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
- goto nla_put_failure;
-
- spin_unlock_bh(&ipt->tcf_lock);
- kfree(t);
- return skb->len;
-
-nla_put_failure:
- spin_unlock_bh(&ipt->tcf_lock);
- nlmsg_trim(skb, b);
- kfree(t);
- return -1;
-}
-
-static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, ipt_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, ipt_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
-static struct tc_action_ops act_ipt_ops = {
- .kind = "ipt",
- .type = TCA_ACT_IPT,
- .owner = THIS_MODULE,
- .act = tcf_ipt_act,
- .dump = tcf_ipt_dump,
- .cleanup = tcf_ipt_release,
- .init = tcf_ipt_init,
- .walk = tcf_ipt_walker,
- .lookup = tcf_ipt_search,
- .size = sizeof(struct tcf_ipt),
-};
-
-static __net_init int ipt_init_net(struct net *net)
-{
- struct tc_action_net *tn = net_generic(net, ipt_net_id);
-
- return tc_action_net_init(tn, &act_ipt_ops);
-}
-
-static void __net_exit ipt_exit_net(struct list_head *net_list)
-{
- tc_action_net_exit(net_list, ipt_net_id);
-}
-
-static struct pernet_operations ipt_net_ops = {
- .init = ipt_init_net,
- .exit_batch = ipt_exit_net,
- .id = &ipt_net_id,
- .size = sizeof(struct tc_action_net),
-};
-
-static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, xt_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, xt_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
-static struct tc_action_ops act_xt_ops = {
- .kind = "xt",
- .type = TCA_ACT_XT,
- .owner = THIS_MODULE,
- .act = tcf_ipt_act,
- .dump = tcf_ipt_dump,
- .cleanup = tcf_ipt_release,
- .init = tcf_xt_init,
- .walk = tcf_xt_walker,
- .lookup = tcf_xt_search,
- .size = sizeof(struct tcf_ipt),
-};
-
-static __net_init int xt_init_net(struct net *net)
-{
- struct tc_action_net *tn = net_generic(net, xt_net_id);
-
- return tc_action_net_init(tn, &act_xt_ops);
-}
-
-static void __net_exit xt_exit_net(struct list_head *net_list)
-{
- tc_action_net_exit(net_list, xt_net_id);
-}
-
-static struct pernet_operations xt_net_ops = {
- .init = xt_init_net,
- .exit_batch = xt_exit_net,
- .id = &xt_net_id,
- .size = sizeof(struct tc_action_net),
-};
-
-MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
-MODULE_DESCRIPTION("Iptables target actions");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("act_xt");
-
-static int __init ipt_init_module(void)
-{
- int ret1, ret2;
-
- ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops);
- if (ret1 < 0)
- pr_err("Failed to load xt action\n");
-
- ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops);
- if (ret2 < 0)
- pr_err("Failed to load ipt action\n");
-
- if (ret1 < 0 && ret2 < 0) {
- return ret1;
- } else
- return 0;
-}
-
-static void __exit ipt_cleanup_module(void)
-{
- tcf_unregister_action(&act_ipt_ops, &ipt_net_ops);
- tcf_unregister_action(&act_xt_ops, &xt_net_ops);
-}
-
-module_init(ipt_init_module);
-module_exit(ipt_cleanup_module);
diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c
index 6445184b2759..ea0573cb8b2d 100644
--- a/net/sched/act_meta_mark.c
+++ b/net/sched/act_meta_mark.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_meta_mark.c IFE skb->mark metadata module
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2015)
- *
*/
#include <linux/types.h>
diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c
index 4033f9fc4d4a..2df3133ce5ad 100644
--- a/net/sched/act_meta_skbprio.c
+++ b/net/sched/act_meta_skbprio.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_meta_prio.c IFE skb->priority metadata module
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2015)
- *
*/
#include <linux/types.h>
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
index 7221437ca3a6..44547caead46 100644
--- a/net/sched/act_meta_skbtcindex.c
+++ b/net/sched/act_meta_skbtcindex.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2016)
- *
*/
#include <linux/types.h>
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index c8cf4d10c435..f27b583def78 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_mirred.c packet mirroring and redirect actions
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim (2002-4)
*
* TODO: Add ingress support (and socket redirect support)
- *
*/
#include <linux/types.h>
@@ -24,10 +19,12 @@
#include <linux/if_arp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
+#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>
+#include <net/tc_wrapper.h>
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
@@ -80,36 +77,48 @@ static void tcf_mirred_release(struct tc_action *a)
/* last reference to action, no need to lock */
dev = rcu_dereference_protected(m->tcfm_dev, 1);
- if (dev)
- dev_put(dev);
+ netdev_put(dev, &m->tcfm_dev_tracker);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
+ [TCA_MIRRED_BLOCKID] = NLA_POLICY_MIN(NLA_U32, 1),
};
-static unsigned int mirred_net_id;
static struct tc_action_ops act_mirred_ops;
+static void tcf_mirred_replace_dev(struct tcf_mirred *m,
+ struct net_device *ndev)
+{
+ struct net_device *odev;
+
+ odev = rcu_replace_pointer(m->tcfm_dev, ndev,
+ lockdep_is_held(&m->tcf_lock));
+ netdev_put(odev, &m->tcfm_dev_tracker);
+}
+
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct tcf_proto *tp,
+ u32 flags, struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, mirred_net_id);
+ struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_MIRRED_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
bool mac_header_xmit = false;
struct tc_mirred *parm;
struct tcf_mirred *m;
- struct net_device *dev;
bool exists = false;
int ret, err;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
return -EINVAL;
}
- ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack);
+ ret = nla_parse_nested_deprecated(tb, TCA_MIRRED_MAX, nla,
+ mirred_policy, extack);
if (ret < 0)
return ret;
if (!tb[TCA_MIRRED_PARMS]) {
@@ -117,13 +126,24 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
parm = nla_data(tb[TCA_MIRRED_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
+
+ if (tb[TCA_MIRRED_BLOCKID] && parm->ifindex) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot specify Block ID and dev simultaneously");
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+
+ return -EINVAL;
+ }
switch (parm->eaction) {
case TCA_EGRESS_MIRROR:
@@ -135,164 +155,323 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
return -EINVAL;
}
if (!exists) {
- if (!parm->ifindex) {
- tcf_idr_cleanup(tn, parm->index);
- NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
+ if (!parm->ifindex && !tb[TCA_MIRRED_BLOCKID]) {
+ tcf_idr_cleanup(tn, index);
+ NL_SET_ERR_MSG_MOD(extack,
+ "Must specify device or block");
return -EINVAL;
}
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_mirred_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_mirred_ops, bind, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
- } else if (!ovr) {
+ } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
+
m = to_mirred(*a);
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&m->tcfm_list);
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
spin_lock_bh(&m->tcf_lock);
- m->tcf_action = parm->action;
- m->tcfm_eaction = parm->eaction;
if (parm->ifindex) {
- dev = dev_get_by_index(net, parm->ifindex);
- if (!dev) {
+ struct net_device *ndev;
+
+ ndev = dev_get_by_index(net, parm->ifindex);
+ if (!ndev) {
spin_unlock_bh(&m->tcf_lock);
- tcf_idr_release(*a, bind);
- return -ENODEV;
+ err = -ENODEV;
+ goto put_chain;
}
- mac_header_xmit = dev_is_mac_header_xmit(dev);
- rcu_swap_protected(m->tcfm_dev, dev,
- lockdep_is_held(&m->tcf_lock));
- if (dev)
- dev_put(dev);
+ mac_header_xmit = dev_is_mac_header_xmit(ndev);
+ tcf_mirred_replace_dev(m, ndev);
+ netdev_tracker_alloc(ndev, &m->tcfm_dev_tracker, GFP_ATOMIC);
m->tcfm_mac_header_xmit = mac_header_xmit;
+ m->tcfm_blockid = 0;
+ } else if (tb[TCA_MIRRED_BLOCKID]) {
+ tcf_mirred_replace_dev(m, NULL);
+ m->tcfm_mac_header_xmit = false;
+ m->tcfm_blockid = nla_get_u32(tb[TCA_MIRRED_BLOCKID]);
}
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ m->tcfm_eaction = parm->eaction;
spin_unlock_bh(&m->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (ret == ACT_P_CREATED) {
spin_lock(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
spin_unlock(&mirred_list_lock);
-
- tcf_idr_insert(tn, *a);
}
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
-static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int
+tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb)
{
- struct tcf_mirred *m = to_mirred(a);
- struct sk_buff *skb2 = skb;
- bool m_mac_header_xmit;
- struct net_device *dev;
- int retval, err = 0;
- bool use_reinsert;
+ int err;
+
+ if (!want_ingress)
+ err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+ else if (!at_ingress)
+ err = netif_rx(skb);
+ else
+ err = netif_receive_skb(skb);
+
+ return err;
+}
+
+static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m,
+ struct net_device *dev,
+ const bool m_mac_header_xmit, int m_eaction,
+ int retval)
+{
+ struct sk_buff *skb_to_send = skb;
bool want_ingress;
bool is_redirect;
- int m_eaction;
+ bool expects_nh;
+ bool at_ingress;
+ bool dont_clone;
int mac_len;
+ bool at_nh;
+ int err;
- tcf_lastuse_update(&m->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
-
- m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
- m_eaction = READ_ONCE(m->tcfm_eaction);
- retval = READ_ONCE(m->tcf_action);
- dev = rcu_dereference_bh(m->tcfm_dev);
- if (unlikely(!dev)) {
- pr_notice_once("tc mirred: target device is gone\n");
- goto out;
- }
-
- if (unlikely(!(dev->flags & IFF_UP))) {
+ is_redirect = tcf_mirred_is_act_redirect(m_eaction);
+ if (unlikely(!(dev->flags & IFF_UP)) || !netif_carrier_ok(dev)) {
net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
dev->name);
- goto out;
+ goto err_cant_do;
}
/* we could easily avoid the clone only if called by ingress and clsact;
* since we can't easily detect the clsact caller, skip clone only for
* ingress - that covers the TC S/W datapath.
*/
- is_redirect = tcf_mirred_is_act_redirect(m_eaction);
- use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
- tcf_mirred_can_reinsert(retval);
- if (!use_reinsert) {
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (!skb2)
- goto out;
+ at_ingress = skb_at_tc_ingress(skb);
+ dont_clone = skb_at_tc_ingress(skb) && is_redirect &&
+ tcf_mirred_can_reinsert(retval);
+ if (!dont_clone) {
+ skb_to_send = skb_clone(skb, GFP_ATOMIC);
+ if (!skb_to_send)
+ goto err_cant_do;
}
- /* If action's target direction differs than filter's direction,
- * and devices expect a mac header on xmit, then mac push/pull is
- * needed.
- */
want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
- if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) {
- if (!skb_at_tc_ingress(skb)) {
- /* caught at egress, act ingress: pull mac */
- mac_len = skb_network_header(skb) - skb_mac_header(skb);
- skb_pull_rcsum(skb2, mac_len);
+
+ /* All mirred/redirected skbs should clear previous ct info */
+ nf_reset_ct(skb_to_send);
+ if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */
+ skb_dst_drop(skb_to_send);
+
+ expects_nh = want_ingress || !m_mac_header_xmit;
+ at_nh = skb->data == skb_network_header(skb);
+ if (at_nh != expects_nh) {
+ mac_len = at_ingress ? skb->mac_len :
+ skb_network_offset(skb);
+ if (expects_nh) {
+ /* target device/action expect data at nh */
+ skb_pull_rcsum(skb_to_send, mac_len);
} else {
- /* caught at ingress, act egress: push mac */
- skb_push_rcsum(skb2, skb->mac_len);
+ /* target device/action expect data at mac */
+ skb_push_rcsum(skb_to_send, mac_len);
}
}
- skb2->skb_iif = skb->dev->ifindex;
- skb2->dev = dev;
+ skb_to_send->skb_iif = skb->dev->ifindex;
+ skb_to_send->dev = dev;
- /* mirror is always swallowed */
if (is_redirect) {
- skb2->tc_redirected = 1;
- skb2->tc_from_ingress = skb2->tc_at_ingress;
- if (skb2->tc_from_ingress)
- skb2->tstamp = 0;
- /* let's the caller reinsert the packet, if possible */
- if (use_reinsert) {
- res->ingress = want_ingress;
- res->qstats = this_cpu_ptr(m->common.cpu_qstats);
- return TC_ACT_REINSERT;
- }
+ if (skb == skb_to_send)
+ retval = TC_ACT_CONSUMED;
+
+ skb_set_redirected(skb_to_send, skb_to_send->tc_at_ingress);
+
+ err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send);
+ } else {
+ err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send);
}
+ if (err)
+ tcf_action_inc_overlimit_qstats(&m->common);
- if (!want_ingress)
- err = dev_queue_xmit(skb2);
- else
- err = netif_receive_skb(skb2);
+ return retval;
- if (err) {
-out:
- qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
- if (tcf_mirred_is_act_redirect(m_eaction))
- retval = TC_ACT_SHOT;
+err_cant_do:
+ if (is_redirect)
+ retval = TC_ACT_SHOT;
+ tcf_action_inc_overlimit_qstats(&m->common);
+ return retval;
+}
+
+static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m,
+ struct tcf_block *block, int m_eaction,
+ const u32 exception_ifindex, int retval)
+{
+ struct net_device *dev_prev = NULL;
+ struct net_device *dev = NULL;
+ unsigned long index;
+ int mirred_eaction;
+
+ mirred_eaction = tcf_mirred_act_wants_ingress(m_eaction) ?
+ TCA_INGRESS_MIRROR : TCA_EGRESS_MIRROR;
+
+ xa_for_each(&block->ports, index, dev) {
+ if (index == exception_ifindex)
+ continue;
+
+ if (!dev_prev)
+ goto assign_prev;
+
+ tcf_mirred_to_dev(skb, m, dev_prev,
+ dev_is_mac_header_xmit(dev),
+ mirred_eaction, retval);
+assign_prev:
+ dev_prev = dev;
}
+ if (dev_prev)
+ return tcf_mirred_to_dev(skb, m, dev_prev,
+ dev_is_mac_header_xmit(dev_prev),
+ m_eaction, retval);
+
return retval;
}
-static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse, bool hw)
+static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m,
+ struct tcf_block *block, int m_eaction,
+ const u32 exception_ifindex, int retval)
+{
+ struct net_device *dev = NULL;
+ unsigned long index;
+
+ xa_for_each(&block->ports, index, dev) {
+ if (index == exception_ifindex)
+ continue;
+
+ tcf_mirred_to_dev(skb, m, dev,
+ dev_is_mac_header_xmit(dev),
+ m_eaction, retval);
+ }
+
+ return retval;
+}
+
+static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m,
+ const u32 blockid, struct tcf_result *res,
+ int retval)
+{
+ const u32 exception_ifindex = skb->dev->ifindex;
+ struct tcf_block *block;
+ bool is_redirect;
+ int m_eaction;
+
+ m_eaction = READ_ONCE(m->tcfm_eaction);
+ is_redirect = tcf_mirred_is_act_redirect(m_eaction);
+
+ /* we are already under rcu protection, so can call block lookup
+ * directly.
+ */
+ block = tcf_block_lookup(dev_net(skb->dev), blockid);
+ if (!block || xa_empty(&block->ports)) {
+ tcf_action_inc_overlimit_qstats(&m->common);
+ return retval;
+ }
+
+ if (is_redirect)
+ return tcf_blockcast_redir(skb, m, block, m_eaction,
+ exception_ifindex, retval);
+
+	/* If it's not a redirect, it is a mirror */
+ return tcf_blockcast_mirror(skb, m, block, m_eaction, exception_ifindex,
+ retval);
+}
+
+TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_mirred *m = to_mirred(a);
+ int retval = READ_ONCE(m->tcf_action);
+ struct netdev_xmit *xmit;
+ bool m_mac_header_xmit;
+ struct net_device *dev;
+ int i, m_eaction;
+ u32 blockid;
+
+#ifdef CONFIG_PREEMPT_RT
+ xmit = &current->net_xmit;
+#else
+ xmit = this_cpu_ptr(&softnet_data.xmit);
+#endif
+ if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) {
+ net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
+ netdev_name(skb->dev));
+ return TC_ACT_SHOT;
+ }
+
+ tcf_lastuse_update(&m->tcf_tm);
+ tcf_action_update_bstats(&m->common, skb);
+
+ blockid = READ_ONCE(m->tcfm_blockid);
+ if (blockid)
+ return tcf_blockcast(skb, m, blockid, res, retval);
+
+ dev = rcu_dereference_bh(m->tcfm_dev);
+ if (unlikely(!dev)) {
+ pr_notice_once("tc mirred: target device is gone\n");
+ tcf_action_inc_overlimit_qstats(&m->common);
+ return retval;
+ }
+ for (i = 0; i < xmit->sched_mirred_nest; i++) {
+ if (xmit->sched_mirred_dev[i] != dev)
+ continue;
+ pr_notice_once("tc mirred: loop on device %s\n",
+ netdev_name(dev));
+ tcf_action_inc_overlimit_qstats(&m->common);
+ return retval;
+ }
+
+ xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
+
+ m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
+ m_eaction = READ_ONCE(m->tcfm_eaction);
+
+ retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction,
+ retval);
+ xmit->sched_mirred_nest--;
+
+ return retval;
+}
+
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
{
struct tcf_mirred *m = to_mirred(a);
struct tcf_t *tm = &m->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -308,6 +487,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
};
struct net_device *dev;
struct tcf_t t;
+ u32 blockid;
spin_lock_bh(&m->tcf_lock);
opt.action = m->tcf_action;
@@ -319,6 +499,10 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
+ blockid = m->tcfm_blockid;
+ if (blockid && nla_put_u32(skb, TCA_MIRRED_BLOCKID, blockid))
+ goto nla_put_failure;
+
tcf_tm_dump(&t, &m->tcf_tm);
if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
goto nla_put_failure;
@@ -332,23 +516,6 @@ nla_put_failure:
return -1;
}
-static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, mirred_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, mirred_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
static int mirred_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
@@ -361,7 +528,7 @@ static int mirred_device_event(struct notifier_block *unused,
list_for_each_entry(m, &mirred_list, tcfm_list) {
spin_lock_bh(&m->tcf_lock);
if (tcf_mirred_dev_dereference(m) == dev) {
- dev_put(dev);
+ netdev_put(dev, &m->tcfm_dev_tracker);
/* Note : no rcu grace period necessary, as
* net_device are already rcu protected.
*/
@@ -379,57 +546,119 @@ static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
-static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
+static void tcf_mirred_dev_put(void *priv)
+{
+ struct net_device *dev = priv;
+
+ dev_put(dev);
+}
+
+static struct net_device *
+tcf_mirred_get_dev(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
{
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
rcu_read_lock();
dev = rcu_dereference(m->tcfm_dev);
- if (dev)
+ if (dev) {
dev_hold(dev);
+ *destructor = tcf_mirred_dev_put;
+ }
rcu_read_unlock();
return dev;
}
-static void tcf_mirred_put_dev(struct net_device *dev)
+static size_t tcf_mirred_get_fill_size(const struct tc_action *act)
{
- dev_put(dev);
+ return nla_total_size(sizeof(struct tc_mirred));
+}
+
+static void tcf_offload_mirred_get_dev(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ entry->dev = act->ops->get_dev(act, &entry->destructor);
+ if (!entry->dev)
+ return;
+ entry->destructor_priv = entry->dev;
+}
+
+static int tcf_mirred_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
+{
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ if (is_tcf_mirred_egress_redirect(act)) {
+ entry->id = FLOW_ACTION_REDIRECT;
+ tcf_offload_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_egress_mirror(act)) {
+ entry->id = FLOW_ACTION_MIRRED;
+ tcf_offload_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_redirect(act)) {
+ entry->id = FLOW_ACTION_REDIRECT_INGRESS;
+ tcf_offload_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_mirror(act)) {
+ entry->id = FLOW_ACTION_MIRRED_INGRESS;
+ tcf_offload_mirred_get_dev(entry, act);
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported mirred offload");
+ return -EOPNOTSUPP;
+ }
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ if (is_tcf_mirred_egress_redirect(act))
+ fl_action->id = FLOW_ACTION_REDIRECT;
+ else if (is_tcf_mirred_egress_mirror(act))
+ fl_action->id = FLOW_ACTION_MIRRED;
+ else if (is_tcf_mirred_ingress_redirect(act))
+ fl_action->id = FLOW_ACTION_REDIRECT_INGRESS;
+ else if (is_tcf_mirred_ingress_mirror(act))
+ fl_action->id = FLOW_ACTION_MIRRED_INGRESS;
+ else
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
}
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
- .type = TCA_ACT_MIRRED,
+ .id = TCA_ID_MIRRED,
.owner = THIS_MODULE,
.act = tcf_mirred_act,
.stats_update = tcf_stats_update,
.dump = tcf_mirred_dump,
.cleanup = tcf_mirred_release,
.init = tcf_mirred_init,
- .walk = tcf_mirred_walker,
- .lookup = tcf_mirred_search,
+ .get_fill_size = tcf_mirred_get_fill_size,
+ .offload_act_setup = tcf_mirred_offload_act_setup,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
- .put_dev = tcf_mirred_put_dev,
};
+MODULE_ALIAS_NET_ACT("mirred");
static __net_init int mirred_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, mirred_net_id);
+ struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id);
- return tc_action_net_init(tn, &act_mirred_ops);
+ return tc_action_net_init(net, tn, &act_mirred_ops);
}
static void __net_exit mirred_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, mirred_net_id);
+ tc_action_net_exit(net_list, act_mirred_ops.net_id);
}
static struct pernet_operations mirred_net_ops = {
.init = mirred_init_net,
.exit_batch = mirred_exit_net,
- .id = &mirred_net_id,
+ .id = &act_mirred_ops.net_id,
.size = sizeof(struct tc_action_net),
};
@@ -444,7 +673,11 @@ static int __init mirred_init_module(void)
return err;
pr_info("Mirror/redirect action on\n");
- return tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ err = tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ if (err)
+ unregister_netdevice_notifier(&mirred_device_notifier);
+
+ return err;
}
static void __exit mirred_cleanup_module(void)
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
new file mode 100644
index 000000000000..6654011dcd2b
--- /dev/null
+++ b/net/sched/act_mpls.c
@@ -0,0 +1,489 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/tc_act/tc_mpls.h>
+#include <net/mpls.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mpls.h>
+#include <net/tc_wrapper.h>
+
+static struct tc_action_ops act_mpls_ops;
+
+#define ACT_MPLS_TTL_DEFAULT 255
+
+static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse,
+ struct tcf_mpls_params *p, bool set_bos)
+{
+ u32 new_lse = 0;
+
+ if (lse)
+ new_lse = be32_to_cpu(lse->label_stack_entry);
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) {
+ new_lse &= ~MPLS_LS_LABEL_MASK;
+ new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT;
+ }
+ if (p->tcfm_ttl) {
+ new_lse &= ~MPLS_LS_TTL_MASK;
+ new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT;
+ }
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) {
+ new_lse &= ~MPLS_LS_TC_MASK;
+ new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT;
+ }
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) {
+ new_lse &= ~MPLS_LS_S_MASK;
+ new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT;
+ } else if (set_bos) {
+ new_lse |= 1 << MPLS_LS_S_SHIFT;
+ }
+
+ return cpu_to_be32(new_lse);
+}
+
+TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+ __be32 new_lse;
+ int mac_len;
+
+ tcf_lastuse_update(&m->tcf_tm);
+ bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+
+	/* Ensure 'data' points at mac_header prior to calling MPLS manipulating
+ * functions.
+ */
+ if (skb_at_tc_ingress(skb)) {
+ skb_push_rcsum(skb, skb->mac_len);
+ mac_len = skb->mac_len;
+ } else {
+ mac_len = skb_network_offset(skb);
+ }
+
+ p = rcu_dereference_bh(m->mpls_p);
+
+ switch (p->tcfm_action) {
+ case TCA_MPLS_ACT_POP:
+ if (skb_mpls_pop(skb, p->tcfm_proto, mac_len,
+ skb->dev && skb->dev->type == ARPHRD_ETHER))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb_protocol(skb, true)));
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len,
+ skb->dev && skb->dev->type == ARPHRD_ETHER))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_MAC_PUSH:
+ if (skb_vlan_tag_present(skb)) {
+ if (__vlan_insert_inner_tag(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb),
+ ETH_HLEN) < 0)
+ goto drop;
+
+ skb->protocol = skb->vlan_proto;
+ __vlan_hwaccel_clear_tag(skb);
+ }
+
+ new_lse = tcf_mpls_get_lse(NULL, p, mac_len ||
+ !eth_p_mpls(skb->protocol));
+
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, 0, false))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ if (!pskb_may_pull(skb,
+ skb_network_offset(skb) + MPLS_HLEN))
+ goto drop;
+ new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false);
+ if (skb_mpls_update_lse(skb, new_lse))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (skb_mpls_dec_ttl(skb))
+ goto drop;
+ break;
+ }
+
+ if (skb_at_tc_ingress(skb))
+ skb_pull_rcsum(skb, skb->mac_len);
+
+ return p->action;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static int valid_label(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u32 *label = nla_data(attr);
+
+ if (nla_len(attr) != sizeof(*label)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid MPLS label length");
+ return -EINVAL;
+ }
+
+ if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) {
+ NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
+ [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
+ [TCA_MPLS_PROTO] = { .type = NLA_U16 },
+ [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ valid_label),
+ [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7),
+ [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
+ [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
+};
+
+static int tcf_mpls_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
+ struct nlattr *tb[TCA_MPLS_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ struct tcf_mpls_params *p;
+ struct tc_mpls *parm;
+ bool exists = false;
+ struct tcf_mpls *m;
+ int ret = 0, err;
+ u8 mpls_ttl = 0;
+ u32 index;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_MPLS_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "No MPLS params");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_MPLS_PARMS]);
+ index = parm->index;
+
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return ACT_P_BOUND;
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a, &act_mpls_ops, bind,
+ true, flags);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ /* Verify parameters against action type. */
+ switch (parm->m_action) {
+ case TCA_MPLS_ACT_POP:
+ if (!tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop");
+ err = -EINVAL;
+ goto release_idr;
+ }
+ if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop");
+ err = -EINVAL;
+ goto release_idr;
+ }
+ if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] ||
+ tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop");
+ err = -EINVAL;
+ goto release_idr;
+ }
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] ||
+ tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl");
+ err = -EINVAL;
+ goto release_idr;
+ }
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ case TCA_MPLS_ACT_MAC_PUSH:
+ if (!tb[TCA_MPLS_LABEL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
+ err = -EINVAL;
+ goto release_idr;
+ }
+ if (tb[TCA_MPLS_PROTO] &&
+ !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push");
+ err = -EPROTONOSUPPORT;
+ goto release_idr;
+ }
+ /* Push needs a TTL - if not specified, set a default value. */
+ if (!tb[TCA_MPLS_TTL]) {
+#if IS_ENABLED(CONFIG_MPLS)
+ mpls_ttl = net->mpls.default_ttl ?
+ net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT;
+#else
+ mpls_ttl = ACT_MPLS_TTL_DEFAULT;
+#endif
+ }
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ if (tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify");
+ err = -EINVAL;
+ goto release_idr;
+ }
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action");
+ err = -EINVAL;
+ goto release_idr;
+ }
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ m = to_mpls(*a);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ p->tcfm_action = parm->m_action;
+ p->tcfm_label = nla_get_u32_default(tb[TCA_MPLS_LABEL],
+ ACT_MPLS_LABEL_NOT_SET);
+ p->tcfm_tc = nla_get_u8_default(tb[TCA_MPLS_TC], ACT_MPLS_TC_NOT_SET);
+ p->tcfm_ttl = nla_get_u8_default(tb[TCA_MPLS_TTL], mpls_ttl);
+ p->tcfm_bos = nla_get_u8_default(tb[TCA_MPLS_BOS],
+ ACT_MPLS_BOS_NOT_SET);
+ p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO],
+ htons(ETH_P_MPLS_UC));
+ p->action = parm->action;
+
+ spin_lock_bh(&m->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+ spin_unlock_bh(&m->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (p)
+ kfree_rcu(p, rcu);
+
+ return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_mpls_cleanup(struct tc_action *a)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+
+ p = rcu_dereference_protected(m->mpls_p, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ const struct tcf_mpls *m = to_mpls(a);
+ const struct tcf_mpls_params *p;
+ struct tc_mpls opt = {
+ .index = m->tcf_index,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ rcu_read_lock();
+ p = rcu_dereference(m->mpls_p);
+ opt.m_action = p->tcfm_action;
+ opt.action = p->action;
+
+ if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET &&
+ nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label))
+ goto nla_put_failure;
+
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc))
+ goto nla_put_failure;
+
+ if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl))
+ goto nla_put_failure;
+
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &m->tcf_tm);
+
+ if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
+ goto nla_put_failure;
+
+ rcu_read_unlock();
+
+ return skb->len;
+
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+static int tcf_mpls_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
+{
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ switch (tcf_mpls_action(act)) {
+ case TCA_MPLS_ACT_PUSH:
+ entry->id = FLOW_ACTION_MPLS_PUSH;
+ entry->mpls_push.proto = tcf_mpls_proto(act);
+ entry->mpls_push.label = tcf_mpls_label(act);
+ entry->mpls_push.tc = tcf_mpls_tc(act);
+ entry->mpls_push.bos = tcf_mpls_bos(act);
+ entry->mpls_push.ttl = tcf_mpls_ttl(act);
+ break;
+ case TCA_MPLS_ACT_POP:
+ entry->id = FLOW_ACTION_MPLS_POP;
+ entry->mpls_pop.proto = tcf_mpls_proto(act);
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ entry->id = FLOW_ACTION_MPLS_MANGLE;
+ entry->mpls_mangle.label = tcf_mpls_label(act);
+ entry->mpls_mangle.tc = tcf_mpls_tc(act);
+ entry->mpls_mangle.bos = tcf_mpls_bos(act);
+ entry->mpls_mangle.ttl = tcf_mpls_ttl(act);
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"dec_ttl\" option is used");
+ return -EOPNOTSUPP;
+ case TCA_MPLS_ACT_MAC_PUSH:
+ NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"mac_push\" option is used");
+ return -EOPNOTSUPP;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported MPLS mode offload");
+ return -EOPNOTSUPP;
+ }
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ switch (tcf_mpls_action(act)) {
+ case TCA_MPLS_ACT_PUSH:
+ fl_action->id = FLOW_ACTION_MPLS_PUSH;
+ break;
+ case TCA_MPLS_ACT_POP:
+ fl_action->id = FLOW_ACTION_MPLS_POP;
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ fl_action->id = FLOW_ACTION_MPLS_MANGLE;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
+
+ return 0;
+}
+
+static struct tc_action_ops act_mpls_ops = {
+ .kind = "mpls",
+ .id = TCA_ID_MPLS,
+ .owner = THIS_MODULE,
+ .act = tcf_mpls_act,
+ .dump = tcf_mpls_dump,
+ .init = tcf_mpls_init,
+ .cleanup = tcf_mpls_cleanup,
+ .offload_act_setup = tcf_mpls_offload_act_setup,
+ .size = sizeof(struct tcf_mpls),
+};
+MODULE_ALIAS_NET_ACT("mpls");
+
+static __net_init int mpls_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id);
+
+ return tc_action_net_init(net, tn, &act_mpls_ops);
+}
+
+static void __net_exit mpls_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, act_mpls_ops.net_id);
+}
+
+static struct pernet_operations mpls_net_ops = {
+ .init = mpls_init_net,
+ .exit_batch = mpls_exit_net,
+ .id = &act_mpls_ops.net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init mpls_init_module(void)
+{
+ return tcf_register_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+static void __exit mpls_cleanup_module(void)
+{
+ tcf_unregister_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+module_init(mpls_init_module);
+module_exit(mpls_cleanup_module);
+
+MODULE_SOFTDEP("post: mpls_gso");
+MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPLS manipulation actions");
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index c5c1e23add77..26241d80ebe0 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Stateless NAT actions
*
* Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
#include <linux/errno.h>
@@ -21,15 +17,15 @@
#include <linux/string.h>
#include <linux/tc_act/tc_nat.h>
#include <net/act_api.h>
+#include <net/pkt_cls.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/tc_act/tc_nat.h>
#include <net/tcp.h>
#include <net/udp.h>
+#include <net/tc_wrapper.h>
-
-static unsigned int nat_net_id;
static struct tc_action_ops act_nat_ops;
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
@@ -37,66 +33,91 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
};
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
- struct tc_action **a, int ovr, int bind,
- bool rtnl_held, struct netlink_ext_ack *extack)
+ struct tc_action **a, struct tcf_proto *tp,
+ u32 flags, struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, nat_net_id);
+ struct tc_action_net *tn = net_generic(net, act_nat_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
+ struct tcf_nat_parms *nparm, *oparm;
struct nlattr *tb[TCA_NAT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_nat *parm;
int ret = 0, err;
struct tcf_nat *p;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_NAT_MAX, nla, nat_policy,
+ NULL);
if (err < 0)
return err;
if (tb[TCA_NAT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_nat_ops, bind, false);
+ ret = tcf_idr_create_from_flags(tn, index, est, a, &act_nat_ops,
+ bind, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
} else if (err > 0) {
if (bind)
- return 0;
- if (!ovr) {
+ return ACT_P_BOUND;
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
} else {
return err;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ nparm = kzalloc(sizeof(*nparm), GFP_KERNEL);
+ if (!nparm) {
+ err = -ENOMEM;
+ goto release_idr;
+ }
+
+ nparm->old_addr = parm->old_addr;
+ nparm->new_addr = parm->new_addr;
+ nparm->mask = parm->mask;
+ nparm->flags = parm->flags;
+ nparm->action = parm->action;
+
p = to_tcf_nat(*a);
spin_lock_bh(&p->tcf_lock);
- p->old_addr = parm->old_addr;
- p->new_addr = parm->new_addr;
- p->mask = parm->mask;
- p->flags = parm->flags;
-
- p->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ oparm = rcu_replace_pointer(p->parms, nparm, lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+ if (oparm)
+ kfree_rcu(oparm, rcu);
return ret;
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
-static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_nat *p = to_tcf_nat(a);
+ struct tcf_nat_parms *parms;
struct iphdr *iph;
__be32 old_addr;
__be32 new_addr;
@@ -107,22 +128,19 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
int ihl;
int noff;
- spin_lock(&p->tcf_lock);
-
tcf_lastuse_update(&p->tcf_tm);
- old_addr = p->old_addr;
- new_addr = p->new_addr;
- mask = p->mask;
- egress = p->flags & TCA_NAT_FLAG_EGRESS;
- action = p->tcf_action;
-
- bstats_update(&p->tcf_bstats, skb);
-
- spin_unlock(&p->tcf_lock);
+ tcf_action_update_bstats(&p->common, skb);
+ parms = rcu_dereference_bh(p->parms);
+ action = parms->action;
if (unlikely(action == TC_ACT_SHOT))
goto drop;
+ old_addr = parms->old_addr;
+ new_addr = parms->new_addr;
+ mask = parms->mask;
+ egress = parms->flags & TCA_NAT_FLAG_EGRESS;
+
noff = skb_network_offset(skb);
if (!pskb_may_pull(skb, sizeof(*iph) + noff))
goto drop;
@@ -197,9 +215,7 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
icmph = (void *)(skb_network_header(skb) + ihl);
- if ((icmph->type != ICMP_DEST_UNREACH) &&
- (icmph->type != ICMP_TIME_EXCEEDED) &&
- (icmph->type != ICMP_PARAMETERPROB))
+ if (!icmp_is_err(icmph->type))
break;
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
@@ -244,9 +260,7 @@ out:
return action;
drop:
- spin_lock(&p->tcf_lock);
- p->tcf_qstats.drops++;
- spin_unlock(&p->tcf_lock);
+ tcf_action_inc_drop_qstats(&p->common);
return TC_ACT_SHOT;
}
@@ -254,7 +268,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_nat *p = to_tcf_nat(a);
+ const struct tcf_nat *p = to_tcf_nat(a);
+ const struct tcf_nat_parms *parms;
struct tc_nat opt = {
.index = p->tcf_index,
.refcnt = refcount_read(&p->tcf_refcnt) - ref,
@@ -262,12 +277,15 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&p->tcf_lock);
- opt.old_addr = p->old_addr;
- opt.new_addr = p->new_addr;
- opt.mask = p->mask;
- opt.flags = p->flags;
- opt.action = p->tcf_action;
+ rcu_read_lock();
+
+ parms = rcu_dereference(p->parms);
+
+ opt.action = parms->action;
+ opt.old_addr = parms->old_addr;
+ opt.new_addr = parms->new_addr;
+ opt.mask = parms->mask;
+ opt.flags = parms->flags;
if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -275,61 +293,54 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
goto nla_put_failure;
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
-static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, nat_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
+static void tcf_nat_cleanup(struct tc_action *a)
{
- struct tc_action_net *tn = net_generic(net, nat_net_id);
+ struct tcf_nat *p = to_tcf_nat(a);
+ struct tcf_nat_parms *parms;
- return tcf_idr_search(tn, a, index);
+ parms = rcu_dereference_protected(p->parms, 1);
+ if (parms)
+ kfree_rcu(parms, rcu);
}
static struct tc_action_ops act_nat_ops = {
.kind = "nat",
- .type = TCA_ACT_NAT,
+ .id = TCA_ID_NAT,
.owner = THIS_MODULE,
.act = tcf_nat_act,
.dump = tcf_nat_dump,
.init = tcf_nat_init,
- .walk = tcf_nat_walker,
- .lookup = tcf_nat_search,
+ .cleanup = tcf_nat_cleanup,
.size = sizeof(struct tcf_nat),
};
+MODULE_ALIAS_NET_ACT("nat");
static __net_init int nat_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, nat_net_id);
+ struct tc_action_net *tn = net_generic(net, act_nat_ops.net_id);
- return tc_action_net_init(tn, &act_nat_ops);
+ return tc_action_net_init(net, tn, &act_nat_ops);
}
static void __net_exit nat_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, nat_net_id);
+ tc_action_net_exit(net_list, act_nat_ops.net_id);
}
static struct pernet_operations nat_net_ops = {
.init = nat_init_net,
.exit_batch = nat_exit_net,
- .id = &nat_net_id,
+ .id = &act_nat_ops.net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 2b372a06b432..4b65901397a8 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_pedit.c Generic packet editor
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim (2002-4)
*/
@@ -17,28 +13,34 @@
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
#include <linux/slab.h>
+#include <net/ipv6.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_pedit.h>
#include <net/tc_act/tc_pedit.h>
#include <uapi/linux/tc_act/tc_pedit.h>
+#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
-static unsigned int pedit_net_id;
static struct tc_action_ops act_pedit_ops;
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
+ [TCA_PEDIT_PARMS_EX] = { .len = sizeof(struct tc_pedit) },
[TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED },
};
static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
- [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 },
- [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 },
+ [TCA_PEDIT_KEY_EX_HTYPE] =
+ NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_HDR_TYPE_MAX),
+ [TCA_PEDIT_KEY_EX_CMD] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_CMD_MAX),
};
static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
- u8 n)
+ u8 n, struct netlink_ext_ack *extack)
{
struct tcf_pedit_key_ex *keys_ex;
struct tcf_pedit_key_ex *k;
@@ -46,7 +48,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
int err = -EINVAL;
int rem;
- if (!nla || !n)
+ if (!nla)
return NULL;
keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
@@ -59,40 +61,44 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1];
if (!n) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't parse more extended keys than requested");
err = -EINVAL;
goto err_out;
}
n--;
if (nla_type(ka) != TCA_PEDIT_KEY_EX) {
+ NL_SET_ERR_MSG_ATTR(extack, ka, "Unknown attribute, expected extended key");
err = -EINVAL;
goto err_out;
}
- err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
- pedit_key_ex_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX,
+ ka, pedit_key_ex_policy,
+ NULL);
if (err)
goto err_out;
- if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
- !tb[TCA_PEDIT_KEY_EX_CMD]) {
+ if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_HTYPE)) {
+ NL_SET_ERR_MSG(extack, "Missing required attribute");
err = -EINVAL;
goto err_out;
}
- k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
- k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
-
- if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
- k->cmd > TCA_PEDIT_CMD_MAX) {
+ if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_CMD)) {
+ NL_SET_ERR_MSG(extack, "Missing required attribute");
err = -EINVAL;
goto err_out;
}
+ k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+ k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
+
k++;
}
if (n) {
+ NL_SET_ERR_MSG_MOD(extack, "Not enough extended keys to parse");
err = -EINVAL;
goto err_out;
}
@@ -107,14 +113,15 @@ err_out:
static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
struct tcf_pedit_key_ex *keys_ex, int n)
{
- struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
+ struct nlattr *keys_start = nla_nest_start_noflag(skb,
+ TCA_PEDIT_KEYS_EX);
if (!keys_start)
goto nla_failure;
for (; n > 0; n--) {
struct nlattr *key_start;
- key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
+ key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX);
if (!key_start)
goto nla_failure;
@@ -135,27 +142,41 @@ nla_failure:
return -EINVAL;
}
+static void tcf_pedit_cleanup_rcu(struct rcu_head *head)
+{
+ struct tcf_pedit_parms *parms =
+ container_of(head, struct tcf_pedit_parms, rcu);
+
+ kfree(parms->tcfp_keys_ex);
+ kfree(parms->tcfp_keys);
+
+ kfree(parms);
+}
+
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, pedit_net_id);
+ struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
+ struct tcf_chain *goto_ch = NULL;
+ struct tcf_pedit_parms *oparms, *nparms;
struct nlattr *tb[TCA_PEDIT_MAX + 1];
- struct tc_pedit_key *keys = NULL;
- struct tcf_pedit_key_ex *keys_ex;
struct tc_pedit *parm;
struct nlattr *pattr;
struct tcf_pedit *p;
int ret = 0, err;
- int ksize;
+ int i, ksize;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla,
+ pedit_policy, NULL);
if (err < 0)
return err;
@@ -168,86 +189,133 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(pattr);
- ksize = parm->nkeys * sizeof(struct tc_pedit_key);
- if (nla_len(pattr) < sizeof(*parm) + ksize) {
- NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
- return -EINVAL;
- }
- keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
- if (IS_ERR(keys_ex))
- return PTR_ERR(keys_ex);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- if (!parm->nkeys) {
- tcf_idr_cleanup(tn, parm->index);
- NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
- ret = -EINVAL;
- goto out_free;
- }
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_pedit_ops, bind, false);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_pedit_ops, bind, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
- goto out_free;
+ tcf_idr_cleanup(tn, index);
+ return ret;
}
ret = ACT_P_CREATED;
} else if (err > 0) {
if (bind)
- goto out_free;
- if (!ovr) {
+ return ACT_P_BOUND;
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
ret = -EEXIST;
goto out_release;
}
} else {
- ret = err;
+ return err;
+ }
+
+ if (!parm->nkeys) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ ret = -EINVAL;
+ goto out_release;
+ }
+ ksize = parm->nkeys * sizeof(struct tc_pedit_key);
+ if (nla_len(pattr) < sizeof(*parm) + ksize) {
+ NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
+ ret = -EINVAL;
+ goto out_release;
+ }
+
+ nparms = kzalloc(sizeof(*nparms), GFP_KERNEL);
+ if (!nparms) {
+ ret = -ENOMEM;
+ goto out_release;
+ }
+
+ nparms->tcfp_keys_ex =
+ tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys, extack);
+ if (IS_ERR(nparms->tcfp_keys_ex)) {
+ ret = PTR_ERR(nparms->tcfp_keys_ex);
goto out_free;
}
- p = to_pedit(*a);
- spin_lock_bh(&p->tcf_lock);
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0) {
+ ret = err;
+ goto out_free_ex;
+ }
- if (ret == ACT_P_CREATED ||
- (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) {
- keys = kmalloc(ksize, GFP_ATOMIC);
- if (!keys) {
- spin_unlock_bh(&p->tcf_lock);
- ret = -ENOMEM;
- goto out_release;
- }
- kfree(p->tcfp_keys);
- p->tcfp_keys = keys;
- p->tcfp_nkeys = parm->nkeys;
+ nparms->tcfp_off_max_hint = 0;
+ nparms->tcfp_flags = parm->flags;
+ nparms->tcfp_nkeys = parm->nkeys;
+
+ nparms->tcfp_keys = kmemdup(parm->keys, ksize, GFP_KERNEL);
+ if (!nparms->tcfp_keys) {
+ ret = -ENOMEM;
+ goto put_chain;
}
- memcpy(p->tcfp_keys, parm->keys, ksize);
- p->tcfp_flags = parm->flags;
- p->tcf_action = parm->action;
+ for (i = 0; i < nparms->tcfp_nkeys; ++i) {
+ u32 offmask = nparms->tcfp_keys[i].offmask;
+ u32 cur = nparms->tcfp_keys[i].off;
+
+ /* The AT option can be added to static offsets in the datapath */
+ if (!offmask && cur % 4) {
+ NL_SET_ERR_MSG_MOD(extack, "Offsets must be on 32bit boundaries");
+ ret = -EINVAL;
+ goto out_free_keys;
+ }
+
+ /* sanitize the shift value for any later use */
+ nparms->tcfp_keys[i].shift = min_t(size_t,
+ BITS_PER_TYPE(int) - 1,
+ nparms->tcfp_keys[i].shift);
- kfree(p->tcfp_keys_ex);
- p->tcfp_keys_ex = keys_ex;
+		/* The AT option can read a single byte, so we can bound the actual
+ * value with uchar max.
+ */
+ cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift;
+
+ /* Each key touches 4 bytes starting from the computed offset */
+ nparms->tcfp_off_max_hint =
+ max(nparms->tcfp_off_max_hint, cur + 4);
+ }
+ p = to_pedit(*a);
+ nparms->action = parm->action;
+ spin_lock_bh(&p->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ oparms = rcu_replace_pointer(p->parms, nparms, 1);
spin_unlock_bh(&p->tcf_lock);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
+
+ if (oparms)
+ call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
return ret;
+out_free_keys:
+ kfree(nparms->tcfp_keys);
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+out_free_ex:
+ kfree(nparms->tcfp_keys_ex);
+out_free:
+ kfree(nparms);
out_release:
tcf_idr_release(*a, bind);
-out_free:
- kfree(keys_ex);
return ret;
-
}
static void tcf_pedit_cleanup(struct tc_action *a)
{
struct tcf_pedit *p = to_pedit(a);
- struct tc_pedit_key *keys = p->tcfp_keys;
+ struct tcf_pedit_parms *parms;
+
+ parms = rcu_dereference_protected(p->parms, 1);
- kfree(keys);
- kfree(p->tcfp_keys_ex);
+ if (parms)
+ call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu);
}
static bool offset_valid(struct sk_buff *skb, int offset)
@@ -261,11 +329,35 @@ static bool offset_valid(struct sk_buff *skb, int offset)
return true;
}
-static int pedit_skb_hdr_offset(struct sk_buff *skb,
- enum pedit_header_type htype, int *hoffset)
+static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type)
{
+ const int noff = skb_network_offset(skb);
int ret = -EINVAL;
+ struct iphdr _iph;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ const struct iphdr *iph = skb_header_pointer(skb, noff, sizeof(_iph), &_iph);
+
+ if (!iph)
+ goto out;
+ *hoffset = noff + iph->ihl * 4;
+ ret = 0;
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ ret = ipv6_find_hdr(skb, hoffset, header_type, NULL, NULL) == header_type ? 0 : -EINVAL;
+ break;
+ }
+out:
+ return ret;
+}
+
+static int pedit_skb_hdr_offset(struct sk_buff *skb,
+ enum pedit_header_type htype, int *hoffset)
+{
+ int ret = -EINVAL;
+ /* 'htype' is validated in the netlink parsing */
switch (htype) {
case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
if (skb_mac_header_was_set(skb)) {
@@ -280,153 +372,162 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb,
ret = 0;
break;
case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+ ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_TCP);
+ break;
case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
- if (skb_transport_header_was_set(skb)) {
- *hoffset = skb_transport_offset(skb);
- ret = 0;
- }
+ ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_UDP);
break;
default:
- ret = -EINVAL;
break;
}
-
return ret;
}
-static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
+ enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+ enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
struct tcf_pedit *p = to_pedit(a);
+ struct tcf_pedit_key_ex *tkey_ex;
+ struct tcf_pedit_parms *parms;
+ struct tc_pedit_key *tkey;
+ u32 max_offset;
int i;
- if (skb_unclone(skb, GFP_ATOMIC))
- return p->tcf_action;
+ parms = rcu_dereference_bh(p->parms);
- spin_lock(&p->tcf_lock);
+ max_offset = (skb_transport_header_was_set(skb) ?
+ skb_transport_offset(skb) :
+ skb_network_offset(skb)) +
+ parms->tcfp_off_max_hint;
+ if (skb_ensure_writable(skb, min(skb->len, max_offset)))
+ goto done;
tcf_lastuse_update(&p->tcf_tm);
+ tcf_action_update_bstats(&p->common, skb);
- if (p->tcfp_nkeys > 0) {
- struct tc_pedit_key *tkey = p->tcfp_keys;
- struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
- enum pedit_header_type htype =
- TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
- enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
-
- for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
- u32 *ptr, hdata;
- int offset = tkey->off;
- int hoffset;
- u32 val;
- int rc;
-
- if (tkey_ex) {
- htype = tkey_ex->htype;
- cmd = tkey_ex->cmd;
-
- tkey_ex++;
- }
+ tkey = parms->tcfp_keys;
+ tkey_ex = parms->tcfp_keys_ex;
- rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
- if (rc) {
- pr_info("tc action pedit bad header type specified (0x%x)\n",
- htype);
- goto bad;
- }
+ for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) {
+ int offset = tkey->off;
+ int hoffset = 0;
+ u32 *ptr, hdata;
+ u32 val;
+ int rc;
- if (tkey->offmask) {
- u8 *d, _d;
-
- if (!offset_valid(skb, hoffset + tkey->at)) {
- pr_info("tc action pedit 'at' offset %d out of bounds\n",
- hoffset + tkey->at);
- goto bad;
- }
- d = skb_header_pointer(skb, hoffset + tkey->at,
- sizeof(_d), &_d);
- if (!d)
- goto bad;
- offset += (*d & tkey->offmask) >> tkey->shift;
- }
+ if (tkey_ex) {
+ htype = tkey_ex->htype;
+ cmd = tkey_ex->cmd;
- if (offset % 4) {
- pr_info("tc action pedit offset must be on 32 bit boundaries\n");
- goto bad;
- }
+ tkey_ex++;
+ }
+
+ rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
+ if (rc) {
+ pr_info_ratelimited("tc action pedit unable to extract header offset for header type (0x%x)\n", htype);
+ goto bad;
+ }
- if (!offset_valid(skb, hoffset + offset)) {
- pr_info("tc action pedit offset %d out of bounds\n",
- hoffset + offset);
+ if (tkey->offmask) {
+ u8 *d, _d;
+
+ if (!offset_valid(skb, hoffset + tkey->at)) {
+ pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n",
+ hoffset + tkey->at);
goto bad;
}
-
- ptr = skb_header_pointer(skb, hoffset + offset,
- sizeof(hdata), &hdata);
- if (!ptr)
+ d = skb_header_pointer(skb, hoffset + tkey->at,
+ sizeof(_d), &_d);
+ if (!d)
goto bad;
- /* just do it, baby */
- switch (cmd) {
- case TCA_PEDIT_KEY_EX_CMD_SET:
- val = tkey->val;
- break;
- case TCA_PEDIT_KEY_EX_CMD_ADD:
- val = (*ptr + tkey->val) & ~tkey->mask;
- break;
- default:
- pr_info("tc action pedit bad command (%d)\n",
- cmd);
+
+ offset += (*d & tkey->offmask) >> tkey->shift;
+ if (offset % 4) {
+ pr_info_ratelimited("tc action pedit offset must be on 32 bit boundaries\n");
goto bad;
}
+ }
- *ptr = ((*ptr & tkey->mask) ^ val);
- if (ptr == &hdata)
- skb_store_bits(skb, hoffset + offset, ptr, 4);
+ if (!offset_valid(skb, hoffset + offset)) {
+ pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset);
+ goto bad;
}
- goto done;
- } else {
- WARN(1, "pedit BUG: index %d\n", p->tcf_index);
+ ptr = skb_header_pointer(skb, hoffset + offset,
+ sizeof(hdata), &hdata);
+ if (!ptr)
+ goto bad;
+ /* just do it, baby */
+ switch (cmd) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ val = tkey->val;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ val = (*ptr + tkey->val) & ~tkey->mask;
+ break;
+ default:
+ pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd);
+ goto bad;
+ }
+
+ *ptr = ((*ptr & tkey->mask) ^ val);
+ if (ptr == &hdata)
+ skb_store_bits(skb, hoffset + offset, ptr, 4);
}
+ goto done;
+
bad:
- p->tcf_qstats.overlimits++;
+ tcf_action_inc_overlimit_qstats(&p->common);
done:
- bstats_update(&p->tcf_bstats, skb);
- spin_unlock(&p->tcf_lock);
- return p->tcf_action;
+ return parms->action;
+}
+
+static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
+{
+ struct tcf_pedit *d = to_pedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_pedit *p = to_pedit(a);
+ const struct tcf_pedit *p = to_pedit(a);
+ const struct tcf_pedit_parms *parms;
struct tc_pedit *opt;
struct tcf_t t;
int s;
- s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
+ rcu_read_lock();
+ parms = rcu_dereference(p->parms);
+ s = struct_size(opt, keys, parms->tcfp_nkeys);
- /* netlink spinlocks held above us - must use ATOMIC */
opt = kzalloc(s, GFP_ATOMIC);
- if (unlikely(!opt))
+ if (unlikely(!opt)) {
+ rcu_read_unlock();
return -ENOBUFS;
+ }
+ opt->nkeys = parms->tcfp_nkeys;
- spin_lock_bh(&p->tcf_lock);
- memcpy(opt->keys, p->tcfp_keys,
- p->tcfp_nkeys * sizeof(struct tc_pedit_key));
+ memcpy(opt->keys, parms->tcfp_keys,
+ flex_array_size(opt, keys, parms->tcfp_nkeys));
opt->index = p->tcf_index;
- opt->nkeys = p->tcfp_nkeys;
- opt->flags = p->tcfp_flags;
- opt->action = p->tcf_action;
+ opt->flags = parms->tcfp_flags;
+ opt->action = parms->action;
opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
- if (p->tcfp_keys_ex) {
- if (tcf_pedit_key_ex_dump(skb,
- p->tcfp_keys_ex,
- p->tcfp_nkeys))
+ if (parms->tcfp_keys_ex) {
+ if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex,
+ parms->tcfp_nkeys))
goto nla_put_failure;
if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
@@ -439,64 +540,104 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
goto nla_put_failure;
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
kfree(opt);
return skb->len;
nla_put_failure:
- spin_unlock_bh(&p->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
kfree(opt);
return -1;
}
-static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
+static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, pedit_net_id);
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+ int k;
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
+ for (k = 0; k < tcf_pedit_nkeys(act); k++) {
+ switch (tcf_pedit_cmd(act, k)) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ entry->id = FLOW_ACTION_MANGLE;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ entry->id = FLOW_ACTION_ADD;
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload");
+ return -EOPNOTSUPP;
+ }
+ entry->mangle.htype = tcf_pedit_htype(act, k);
+ entry->mangle.mask = tcf_pedit_mask(act, k);
+ entry->mangle.val = tcf_pedit_val(act, k);
+ entry->mangle.offset = tcf_pedit_offset(act, k);
+ entry->hw_stats = tc_act_hw_stats(act->hw_stats);
+ entry++;
+ }
+ *index_inc = k;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+ u32 cmd = tcf_pedit_cmd(act, 0);
+ int k;
+
+ switch (cmd) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ fl_action->id = FLOW_ACTION_MANGLE;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ fl_action->id = FLOW_ACTION_ADD;
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload");
+ return -EOPNOTSUPP;
+ }
-static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, pedit_net_id);
+ for (k = 1; k < tcf_pedit_nkeys(act); k++) {
+ if (cmd != tcf_pedit_cmd(act, k)) {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload");
+ return -EOPNOTSUPP;
+ }
+ }
+ }
- return tcf_idr_search(tn, a, index);
+ return 0;
}
static struct tc_action_ops act_pedit_ops = {
.kind = "pedit",
- .type = TCA_ACT_PEDIT,
+ .id = TCA_ID_PEDIT,
.owner = THIS_MODULE,
.act = tcf_pedit_act,
+ .stats_update = tcf_pedit_stats_update,
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
.init = tcf_pedit_init,
- .walk = tcf_pedit_walker,
- .lookup = tcf_pedit_search,
+ .offload_act_setup = tcf_pedit_offload_act_setup,
.size = sizeof(struct tcf_pedit),
};
+MODULE_ALIAS_NET_ACT("pedit");
static __net_init int pedit_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, pedit_net_id);
+ struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id);
- return tc_action_net_init(tn, &act_pedit_ops);
+ return tc_action_net_init(net, tn, &act_pedit_ops);
}
static void __net_exit pedit_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, pedit_net_id);
+ tc_action_net_exit(net_list, act_pedit_ops.net_id);
}
static struct pernet_operations pedit_net_ops = {
.init = pedit_init_net,
.exit_batch = pedit_exit_net,
- .id = &pedit_net_id,
+ .id = &act_pedit_ops.net_id,
.size = sizeof(struct tc_action_net),
};
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index ec8ec55e0fe8..0e1c61183379 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_police.c Input police filter
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* J Hadi Salim (action changes)
*/
@@ -20,84 +16,51 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <net/act_api.h>
+#include <net/gso.h>
#include <net/netlink.h>
-
-struct tcf_police_params {
- int tcfp_result;
- u32 tcfp_ewma_rate;
- s64 tcfp_burst;
- u32 tcfp_mtu;
- s64 tcfp_mtu_ptoks;
- struct psched_ratecfg rate;
- bool rate_present;
- struct psched_ratecfg peak;
- bool peak_present;
- struct rcu_head rcu;
-};
-
-struct tcf_police {
- struct tc_action common;
- struct tcf_police_params __rcu *params;
-
- spinlock_t tcfp_lock ____cacheline_aligned_in_smp;
- s64 tcfp_toks;
- s64 tcfp_ptoks;
- s64 tcfp_t_c;
-};
-
-#define to_police(pc) ((struct tcf_police *)pc)
-
-/* old policer structure from before tc actions */
-struct tc_police_compat {
- u32 index;
- int action;
- u32 limit;
- u32 burst;
- u32 mtu;
- struct tc_ratespec rate;
- struct tc_ratespec peakrate;
-};
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_police.h>
+#include <net/tc_wrapper.h>
/* Each policer is serialized by its individual spinlock */
-static unsigned int police_net_id;
static struct tc_action_ops act_police_ops;
-static int tcf_police_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, police_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_AVRATE] = { .type = NLA_U32 },
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
+ [TCA_POLICE_RATE64] = { .type = NLA_U64 },
+ [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
+ [TCA_POLICE_PKTRATE64] = { .type = NLA_U64, .min = 1 },
+ [TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 },
};
static int tcf_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
int ret = 0, tcfp_result = TC_ACT_OK, err, size;
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_POLICE_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
- struct tc_action_net *tn = net_generic(net, police_net_id);
+ struct tc_action_net *tn = net_generic(net, act_police_ops.net_id);
struct tcf_police_params *new;
bool exists = false;
+ u32 index;
+ u64 rate64, prate64;
+ u64 pps, ppsburst;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_POLICE_MAX, nla,
+ police_policy, NULL);
if (err < 0)
return err;
@@ -108,26 +71,30 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_POLICE_TBF]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, NULL, a,
- &act_police_ops, bind, true);
+ ret = tcf_idr_create(tn, index, NULL, a,
+ &act_police_ops, bind, true, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
spin_lock_init(&(to_police(*a)->tcfp_lock));
- } else if (!ovr) {
+ } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
police = to_police(*a);
if (parm->rate.rate) {
@@ -149,7 +116,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
police->common.cpu_bstats,
&police->tcf_rate_est,
&police->tcf_lock,
- NULL, est);
+ false, est);
if (err)
goto failure;
} else if (tb[TCA_POLICE_AVRATE] &&
@@ -169,6 +136,21 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
}
}
+ if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) ||
+ (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) {
+ NL_SET_ERR_MSG(extack,
+ "Both or neither packet-per-second burst and rate must be provided");
+ err = -EINVAL;
+ goto failure;
+ }
+
+ if (tb[TCA_POLICE_PKTRATE64] && R_tab) {
+ NL_SET_ERR_MSG(extack,
+ "packet-per-second and byte-per-second rate limits not allowed in same action");
+ err = -EINVAL;
+ goto failure;
+ }
+
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (unlikely(!new)) {
err = -ENOMEM;
@@ -185,14 +167,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
}
if (R_tab) {
new->rate_present = true;
- psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0);
+ rate64 = nla_get_u64_default(tb[TCA_POLICE_RATE64], 0);
+ psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64);
qdisc_put_rtab(R_tab);
} else {
new->rate_present = false;
}
if (P_tab) {
new->peak_present = true;
- psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0);
+ prate64 = nla_get_u64_default(tb[TCA_POLICE_PEAKRATE64], 0);
+ psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64);
qdisc_put_rtab(P_tab);
} else {
new->peak_present = false;
@@ -206,6 +190,15 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
if (tb[TCA_POLICE_AVRATE])
new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+ if (tb[TCA_POLICE_PKTRATE64]) {
+ pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]);
+ ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]);
+ new->pps_present = true;
+ new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst);
+ psched_ppscfg_precompute(&new->ppsrate, pps);
+ }
+
+ new->action = parm->action;
spin_lock_bh(&police->tcf_lock);
spin_lock_bh(&police->tcfp_lock);
police->tcfp_t_c = ktime_get_ns();
@@ -213,39 +206,57 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
if (new->peak_present)
police->tcfp_ptoks = new->tcfp_mtu_ptoks;
spin_unlock_bh(&police->tcfp_lock);
- police->tcf_action = parm->action;
- rcu_swap_protected(police->params,
- new,
- lockdep_is_held(&police->tcf_lock));
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ new = rcu_replace_pointer(police->params,
+ new,
+ lockdep_is_held(&police->tcf_lock));
spin_unlock_bh(&police->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (new)
kfree_rcu(new, rcu);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
return ret;
failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
tcf_idr_release(*a, bind);
return err;
}
-static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit)
+{
+ u32 len;
+
+ if (skb_is_gso(skb))
+ return skb_gso_validate_mac_len(skb, limit);
+
+ len = qdisc_pkt_len(skb);
+ if (skb_at_tc_ingress(skb))
+ len += skb->mac_len;
+
+ return len <= limit;
+}
+
+TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_police *police = to_police(a);
+ s64 now, toks, ppstoks = 0, ptoks = 0;
struct tcf_police_params *p;
- s64 now, toks, ptoks = 0;
int ret;
tcf_lastuse_update(&police->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb);
+ bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb);
- ret = READ_ONCE(police->tcf_action);
p = rcu_dereference_bh(police->params);
+ ret = p->action;
if (p->tcfp_ewma_rate) {
struct gnet_stats_rate_est64 sample;
@@ -255,8 +266,8 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
goto inc_overlimits;
}
- if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
- if (!p->rate_present) {
+ if (tcf_police_mtu_check(skb, p->tcfp_mtu)) {
+ if (!p->rate_present && !p->pps_present) {
ret = p->tcfp_result;
goto end;
}
@@ -271,14 +282,23 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
ptoks -= (s64)psched_l2t_ns(&p->peak,
qdisc_pkt_len(skb));
}
- toks += police->tcfp_toks;
- if (toks > p->tcfp_burst)
- toks = p->tcfp_burst;
- toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
- if ((toks|ptoks) >= 0) {
+ if (p->rate_present) {
+ toks += police->tcfp_toks;
+ if (toks > p->tcfp_burst)
+ toks = p->tcfp_burst;
+ toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
+ } else if (p->pps_present) {
+ ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst);
+ ppstoks += police->tcfp_pkttoks;
+ if (ppstoks > p->tcfp_pkt_burst)
+ ppstoks = p->tcfp_pkt_burst;
+ ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1);
+ }
+ if ((toks | ptoks | ppstoks) >= 0) {
police->tcfp_t_c = now;
police->tcfp_toks = toks;
police->tcfp_ptoks = ptoks;
+ police->tcfp_pkttoks = ppstoks;
spin_unlock_bh(&police->tcfp_lock);
ret = p->tcfp_result;
goto inc_drops;
@@ -305,12 +325,23 @@ static void tcf_police_cleanup(struct tc_action *a)
kfree_rcu(p, rcu);
}
+static void tcf_police_stats_update(struct tc_action *a,
+ u64 bytes, u64 packets, u64 drops,
+ u64 lastuse, bool hw)
+{
+ struct tcf_police *police = to_police(a);
+ struct tcf_t *tm = &police->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
+ const struct tcf_police *police = to_police(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_police *police = to_police(a);
- struct tcf_police_params *p;
+ const struct tcf_police_params *p;
struct tc_police opt = {
.index = police->tcf_index,
.refcnt = refcount_read(&police->tcf_refcnt) - ref,
@@ -318,16 +349,37 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&police->tcf_lock);
- opt.action = police->tcf_action;
- p = rcu_dereference_protected(police->params,
- lockdep_is_held(&police->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(police->params);
+ opt.action = p->action;
opt.mtu = p->tcfp_mtu;
opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
- if (p->rate_present)
+ if (p->rate_present) {
psched_ratecfg_getrate(&opt.rate, &p->rate);
- if (p->peak_present)
+ if ((p->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_RATE64,
+ p->rate.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
+ if (p->peak_present) {
psched_ratecfg_getrate(&opt.peakrate, &p->peak);
+ if ((p->peak.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64,
+ p->peak.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
+ if (p->pps_present) {
+ if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64,
+ p->ppsrate.rate_pkts_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64,
+ PSCHED_NS2TICKS(p->tcfp_pkt_burst),
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
goto nla_put_failure;
if (p->tcfp_result &&
@@ -337,27 +389,99 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
- t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse);
- t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+ tcf_tm_dump(&t, &police->tcf_tm);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
- spin_unlock_bh(&police->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&police->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
-static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_police_act_to_flow_act(int tc_act, u32 *extval,
+ struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, police_net_id);
+ int act_id = -EOPNOTSUPP;
+
+ if (!TC_ACT_EXT_OPCODE(tc_act)) {
+ if (tc_act == TC_ACT_OK)
+ act_id = FLOW_ACTION_ACCEPT;
+ else if (tc_act == TC_ACT_SHOT)
+ act_id = FLOW_ACTION_DROP;
+ else if (tc_act == TC_ACT_PIPE)
+ act_id = FLOW_ACTION_PIPE;
+ else if (tc_act == TC_ACT_RECLASSIFY)
+ NL_SET_ERR_MSG_MOD(extack, "Offload not supported when conform/exceed action is \"reclassify\"");
+ else
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload");
+ } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_GOTO_CHAIN)) {
+ act_id = FLOW_ACTION_GOTO;
+ *extval = tc_act & TC_ACT_EXT_VAL_MASK;
+ } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_JUMP)) {
+ act_id = FLOW_ACTION_JUMP;
+ *extval = tc_act & TC_ACT_EXT_VAL_MASK;
+ } else if (tc_act == TC_ACT_UNSPEC) {
+ act_id = FLOW_ACTION_CONTINUE;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload");
+ }
+
+ return act_id;
+}
+
+static int tcf_police_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
+{
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+ struct tcf_police *police = to_police(act);
+ struct tcf_police_params *p;
+ int act_id;
+
+ p = rcu_dereference_protected(police->params,
+ lockdep_is_held(&police->tcf_lock));
+
+ entry->id = FLOW_ACTION_POLICE;
+ entry->police.burst = tcf_police_burst(act);
+ entry->police.rate_bytes_ps =
+ tcf_police_rate_bytes_ps(act);
+ entry->police.peakrate_bytes_ps = tcf_police_peakrate_bytes_ps(act);
+ entry->police.avrate = tcf_police_tcfp_ewma_rate(act);
+ entry->police.overhead = tcf_police_rate_overhead(act);
+ entry->police.burst_pkt = tcf_police_burst_pkt(act);
+ entry->police.rate_pkt_ps =
+ tcf_police_rate_pkt_ps(act);
+ entry->police.mtu = tcf_police_tcfp_mtu(act);
+
+ act_id = tcf_police_act_to_flow_act(police->tcf_action,
+ &entry->police.exceed.extval,
+ extack);
+ if (act_id < 0)
+ return act_id;
+
+ entry->police.exceed.act_id = act_id;
+
+ act_id = tcf_police_act_to_flow_act(p->tcfp_result,
+ &entry->police.notexceed.extval,
+ extack);
+ if (act_id < 0)
+ return act_id;
+
+ entry->police.notexceed.act_id = act_id;
+
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ fl_action->id = FLOW_ACTION_POLICE;
+ }
- return tcf_idr_search(tn, a, index);
+ return 0;
}
MODULE_AUTHOR("Alexey Kuznetsov");
@@ -366,33 +490,34 @@ MODULE_LICENSE("GPL");
static struct tc_action_ops act_police_ops = {
.kind = "police",
- .type = TCA_ID_POLICE,
+ .id = TCA_ID_POLICE,
.owner = THIS_MODULE,
+ .stats_update = tcf_police_stats_update,
.act = tcf_police_act,
.dump = tcf_police_dump,
.init = tcf_police_init,
- .walk = tcf_police_walker,
- .lookup = tcf_police_search,
.cleanup = tcf_police_cleanup,
+ .offload_act_setup = tcf_police_offload_act_setup,
.size = sizeof(struct tcf_police),
};
+MODULE_ALIAS_NET_ACT("police");
static __net_init int police_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, police_net_id);
+ struct tc_action_net *tn = net_generic(net, act_police_ops.net_id);
- return tc_action_net_init(tn, &act_police_ops);
+ return tc_action_net_init(net, tn, &act_police_ops);
}
static void __net_exit police_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, police_net_id);
+ tc_action_net_exit(net_list, act_police_ops.net_id);
}
static struct pernet_operations police_net_ops = {
.init = police_init_net,
.exit_batch = police_exit_net,
- .id = &police_net_id,
+ .id = &act_police_ops.net_id,
.size = sizeof(struct tc_action_net),
};
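
For readers following the policer hunks above: tcf_police_act() now keeps either a byte-rate token bucket (rate_present) or a packet-per-second bucket (pps_present), with credit stored as nanoseconds and refilled from the elapsed time since tcfp_t_c. The following stand-alone sketch models that accounting; it is not the kernel code, and every name in it (toy_policer, toy_police, the field names) is invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the dual bucket in tcf_police_act() above: credit is kept in
 * nanoseconds, and exactly one of the byte bucket (rate_present) or the
 * packet bucket (pps_present) is armed.  All names here are illustrative.
 */
struct toy_policer {
	bool     rate_present;
	bool     pps_present;
	int64_t  burst_ns;       /* byte-bucket depth, ns of credit */
	int64_t  pkt_burst_ns;   /* packet-bucket depth, ns of credit */
	int64_t  toks;           /* stored byte credit */
	int64_t  pkt_toks;       /* stored packet credit */
	int64_t  t_c;            /* last update time, ns */
	uint64_t bytes_per_sec;
	uint64_t pkts_per_sec;
};

/* Credit consumed by `len` bytes at the configured byte rate. */
static int64_t toy_len_to_ns(const struct toy_policer *p, uint32_t len)
{
	return (int64_t)len * 1000000000LL / (int64_t)p->bytes_per_sec;
}

/* true  -> packet conforms (the kernel returns p->tcfp_result)
 * false -> packet exceeds the rate (the kernel falls through to the exceed action)
 */
static bool toy_police(struct toy_policer *p, int64_t now_ns, uint32_t pkt_len)
{
	int64_t toks = 0, pkt_toks = 0;

	if (p->rate_present) {
		toks = (now_ns - p->t_c) + p->toks;
		if (toks > p->burst_ns)
			toks = p->burst_ns;              /* clamp to bucket depth */
		toks -= toy_len_to_ns(p, pkt_len);       /* charge this packet */
	} else if (p->pps_present) {
		pkt_toks = (now_ns - p->t_c) + p->pkt_toks;
		if (pkt_toks > p->pkt_burst_ns)
			pkt_toks = p->pkt_burst_ns;
		pkt_toks -= 1000000000LL / (int64_t)p->pkts_per_sec;
	}

	if ((toks | pkt_toks) >= 0) {                    /* both credits non-negative */
		p->t_c = now_ns;
		p->toks = toks;
		p->pkt_toks = pkt_toks;
		return true;
	}
	return false;                                    /* exceed: leave stored state untouched */
}

int main(void)
{
	/* 1000 packets/s, burst of 10 packets, offered at 10x the rate. */
	struct toy_policer p = {
		.pps_present = true,
		.pkt_burst_ns = 10 * 1000000LL,
		.pkt_toks = 10 * 1000000LL,
		.pkts_per_sec = 1000,
	};
	int passed = 0;

	for (int i = 0; i < 1000; i++)                   /* one packet every 100us */
		passed += toy_police(&p, (int64_t)i * 100000, 64);
	printf("passed %d of 1000 packets\n", passed);
	return 0;
}

Run against a 10x-overload pattern as in main(), roughly the burst plus one packet per millisecond conforms, which is the behaviour the kernel hunks implement with PSCHED_TICKS2NS()-scaled credits.
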
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 1a0c682fd734..2ceb4d141b71 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/act_sample.c - Packet sampling tc action
* Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -22,10 +19,11 @@
#include <linux/tc_act/tc_sample.h>
#include <net/tc_act/tc_sample.h>
#include <net/psample.h>
+#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
#include <linux/if_arp.h>
-static unsigned int sample_net_id;
static struct tc_action_ops act_sample_ops;
static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
@@ -36,64 +34,84 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
};
static int tcf_sample_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct nlattr *est, struct tc_action **a,
+ struct tcf_proto *tp,
+ u32 flags, struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, sample_net_id);
+ struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
struct psample_group *psample_group;
+ u32 psample_group_num, rate, index;
+ struct tcf_chain *goto_ch = NULL;
struct tc_sample *parm;
- u32 psample_group_num;
struct tcf_sample *s;
bool exists = false;
int ret, err;
if (!nla)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_SAMPLE_MAX, nla,
+ sample_policy, NULL);
if (ret < 0)
return ret;
- if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
- !tb[TCA_SAMPLE_PSAMPLE_GROUP])
+
+ if (!tb[TCA_SAMPLE_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_SAMPLE_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_sample_ops, bind, true);
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_sample_ops, bind, true, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
- } else if (!ovr) {
+ } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ if (!tb[TCA_SAMPLE_RATE] || !tb[TCA_SAMPLE_PSAMPLE_GROUP]) {
+ NL_SET_ERR_MSG(extack, "sample rate and group are required");
+ err = -EINVAL;
+ goto release_idr;
+ }
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ if (!rate) {
+ NL_SET_ERR_MSG(extack, "invalid sample rate");
+ err = -EINVAL;
+ goto put_chain;
+ }
psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
psample_group = psample_group_get(net, psample_group_num);
if (!psample_group) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
s = to_sample(*a);
spin_lock_bh(&s->tcf_lock);
- s->tcf_action = parm->action;
- s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ s->rate = rate;
s->psample_group_num = psample_group_num;
- RCU_INIT_POINTER(s->psample_group, psample_group);
+ psample_group = rcu_replace_pointer(s->psample_group, psample_group,
+ lockdep_is_held(&s->tcf_lock));
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
@@ -101,9 +119,18 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
}
spin_unlock_bh(&s->tcf_lock);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
+ if (psample_group)
+ psample_group_put(psample_group);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static void tcf_sample_cleanup(struct tc_action *a)
@@ -125,6 +152,7 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev)
case ARPHRD_TUNNEL6:
case ARPHRD_SIT:
case ARPHRD_IPGRE:
+ case ARPHRD_IP6GRE:
case ARPHRD_VOID:
case ARPHRD_NONE:
return false;
@@ -133,39 +161,48 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev)
}
}
-static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
+ u8 cookie_data[TC_COOKIE_MAX_SIZE];
+ struct psample_metadata md = {};
+ struct tc_cookie *user_cookie;
int retval;
- int size;
- int iif;
- int oif;
tcf_lastuse_update(&s->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
+ bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb);
retval = READ_ONCE(s->tcf_action);
psample_group = rcu_dereference_bh(s->psample_group);
/* randomly sample packets according to rate */
- if (psample_group && (prandom_u32() % s->rate == 0)) {
+ if (psample_group && (get_random_u32_below(s->rate) == 0)) {
if (!skb_at_tc_ingress(skb)) {
- iif = skb->skb_iif;
- oif = skb->dev->ifindex;
+ md.in_ifindex = skb->skb_iif;
+ md.out_ifindex = skb->dev->ifindex;
} else {
- iif = skb->dev->ifindex;
- oif = 0;
+ md.in_ifindex = skb->dev->ifindex;
}
/* on ingress, the mac header gets popped, so push it back */
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_push(skb, skb->mac_len);
- size = s->truncate ? s->trunc_size : skb->len;
- psample_sample_packet(psample_group, skb, size, iif, oif,
- s->rate);
+ rcu_read_lock();
+ user_cookie = rcu_dereference(a->user_cookie);
+ if (user_cookie) {
+ memcpy(cookie_data, user_cookie->data,
+ user_cookie->len);
+ md.user_cookie = cookie_data;
+ md.user_cookie_len = user_cookie->len;
+ }
+ rcu_read_unlock();
+
+ md.trunc_size = s->truncate ? s->trunc_size : skb->len;
+ psample_sample_packet(psample_group, skb, s->rate, &md);
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_pull(skb, skb->mac_len);
@@ -174,6 +211,16 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
return retval;
}
+static void tcf_sample_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
+{
+ struct tcf_sample *s = to_sample(a);
+ struct tcf_t *tm = &s->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
@@ -214,52 +261,91 @@ nla_put_failure:
return -1;
}
-static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
+static void tcf_psample_group_put(void *priv)
+{
+ struct psample_group *group = priv;
+
+ psample_group_put(group);
+}
+
+static struct psample_group *
+tcf_sample_get_group(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
{
- struct tc_action_net *tn = net_generic(net, sample_net_id);
+ struct tcf_sample *s = to_sample(a);
+ struct psample_group *group;
+
+ group = rcu_dereference_protected(s->psample_group,
+ lockdep_is_held(&s->tcf_lock));
+ if (group) {
+ psample_group_take(group);
+ *destructor = tcf_psample_group_put;
+ }
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+ return group;
}
-static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
+static void tcf_offload_sample_get_group(struct flow_action_entry *entry,
+ const struct tc_action *act)
{
- struct tc_action_net *tn = net_generic(net, sample_net_id);
+ entry->sample.psample_group =
+ act->ops->get_psample_group(act, &entry->destructor);
+ entry->destructor_priv = entry->sample.psample_group;
+}
+
+static int tcf_sample_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
+{
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ entry->id = FLOW_ACTION_SAMPLE;
+ entry->sample.trunc_size = tcf_sample_trunc_size(act);
+ entry->sample.truncate = tcf_sample_truncate(act);
+ entry->sample.rate = tcf_sample_rate(act);
+ tcf_offload_sample_get_group(entry, act);
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ fl_action->id = FLOW_ACTION_SAMPLE;
+ }
- return tcf_idr_search(tn, a, index);
+ return 0;
}
static struct tc_action_ops act_sample_ops = {
.kind = "sample",
- .type = TCA_ACT_SAMPLE,
+ .id = TCA_ID_SAMPLE,
.owner = THIS_MODULE,
.act = tcf_sample_act,
+ .stats_update = tcf_sample_stats_update,
.dump = tcf_sample_dump,
.init = tcf_sample_init,
.cleanup = tcf_sample_cleanup,
- .walk = tcf_sample_walker,
- .lookup = tcf_sample_search,
+ .get_psample_group = tcf_sample_get_group,
+ .offload_act_setup = tcf_sample_offload_act_setup,
.size = sizeof(struct tcf_sample),
};
+MODULE_ALIAS_NET_ACT("sample");
static __net_init int sample_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, sample_net_id);
+ struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id);
- return tc_action_net_init(tn, &act_sample_ops);
+ return tc_action_net_init(net, tn, &act_sample_ops);
}
static void __net_exit sample_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, sample_net_id);
+ tc_action_net_exit(net_list, act_sample_ops.net_id);
}
static struct pernet_operations sample_net_ops = {
.init = sample_init_net,
.exit_batch = sample_exit_net,
- .id = &sample_net_id,
+ .id = &act_sample_ops.net_id,
.size = sizeof(struct tc_action_net),
};
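
The act_sample hunks above change two things: the sampling decision moves from prandom_u32() % rate to get_random_u32_below(rate) == 0, and the ancillary data (ifindexes, truncation size, user cookie) is carried in struct psample_metadata instead of loose arguments. A small user-space style sketch of the same 1-in-N decision and truncation follows; the struct and function names are illustrative, not the psample API.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for the metadata act_sample now hands to psample;
 * field names only loosely mirror struct psample_metadata.
 */
struct toy_sample_md {
	int      in_ifindex;
	int      out_ifindex;
	uint32_t trunc_size;     /* bytes of the packet to report */
};

/* Sample on average one packet in every `rate`.  The kernel now uses
 * get_random_u32_below(rate) == 0, which avoids the modulo bias of the
 * old prandom_u32() % rate; rand() % rate below keeps the sketch simple.
 */
static bool toy_should_sample(uint32_t rate)
{
	return rate && ((uint32_t)rand() % rate) == 0;
}

/* Copy at most trunc_size bytes of the packet into the report buffer,
 * the way psample truncates the payload it forwards to user space.
 */
static size_t toy_report_packet(const uint8_t *pkt, size_t pkt_len,
				const struct toy_sample_md *md,
				uint8_t *report, size_t report_len)
{
	size_t copy = pkt_len < md->trunc_size ? pkt_len : md->trunc_size;

	if (copy > report_len)
		copy = report_len;
	memcpy(report, pkt, copy);
	return copy;
}

int main(void)
{
	uint8_t pkt[256] = { 0 }, report[128];
	struct toy_sample_md md = { .in_ifindex = 2, .trunc_size = 64 };
	int sampled = 0;

	for (int i = 0; i < 100000; i++) {
		if (toy_should_sample(100)) {            /* ~1% of packets */
			toy_report_packet(pkt, sizeof(pkt), &md,
					  report, sizeof(report));
			sampled++;
		}
	}
	return sampled == 0;                             /* expect some samples */
}
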
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 902957beceb3..8e69a919b4fe 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_simple.c Simple example of an action
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim (2005-8)
- *
*/
#include <linux/module.h>
@@ -18,18 +13,18 @@
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
-#define TCA_ACT_SIMP 22
+#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
#include <linux/tc_act/tc_defact.h>
#include <net/tc_act/tc_defact.h>
-static unsigned int simp_net_id;
static struct tc_action_ops act_simp_ops;
#define SIMP_MAX_DATA 32
-static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_defact *d = to_defact(a);
@@ -41,8 +36,9 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
* Example if this was the 3rd packet and the string was "hello"
* then it would look like "hello_3" (without quotes)
*/
- pr_info("simple: %s_%d\n",
- (char *)d->tcfd_defdata, d->tcf_bstats.packets);
+ pr_info("simple: %s_%llu\n",
+ (char *)d->tcfd_defdata,
+ u64_stats_read(&d->tcf_bstats.packets));
spin_unlock(&d->tcf_lock);
return d->tcf_action;
}
@@ -58,18 +54,29 @@ static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata)
d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
if (unlikely(!d->tcfd_defdata))
return -ENOMEM;
- nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
return 0;
}
-static void reset_policy(struct tcf_defact *d, const struct nlattr *defdata,
- struct tc_defact *p)
+static int reset_policy(struct tc_action *a, const struct nlattr *defdata,
+ struct tc_defact *p, struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
{
+ struct tcf_chain *goto_ch = NULL;
+ struct tcf_defact *d;
+ int err;
+
+ err = tcf_action_check_ctrlact(p->action, tp, &goto_ch, extack);
+ if (err < 0)
+ return err;
+ d = to_defact(a);
spin_lock_bh(&d->tcf_lock);
- d->tcf_action = p->action;
- memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
- nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch);
+ nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
spin_unlock_bh(&d->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ return 0;
}
static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
@@ -79,20 +86,24 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, simp_net_id);
+ struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_DEF_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_defact *parm;
struct tcf_defact *d;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_DEF_MAX, nla, simple_policy,
+ NULL);
if (err < 0)
return err;
@@ -100,51 +111,60 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_DEF_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
if (tb[TCA_DEF_DATA] == NULL) {
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_simp_ops, bind, false);
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_simp_ops, bind, false, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
d = to_defact(*a);
- ret = alloc_defdata(d, tb[TCA_DEF_DATA]);
- if (ret < 0) {
- tcf_idr_release(*a, bind);
- return ret;
- }
- d->tcf_action = parm->action;
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
+ extack);
+ if (err < 0)
+ goto release_idr;
+
+ err = alloc_defdata(d, tb[TCA_DEF_DATA]);
+ if (err < 0)
+ goto put_chain;
+
+ tcf_action_set_ctrlact(*a, parm->action, goto_ch);
ret = ACT_P_CREATED;
} else {
- d = to_defact(*a);
-
- if (!ovr) {
- tcf_idr_release(*a, bind);
- return -EEXIST;
+ if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+ err = -EEXIST;
+ goto release_idr;
}
- reset_policy(d, tb[TCA_DEF_DATA], parm);
+ err = reset_policy(*a, tb[TCA_DEF_DATA], parm, tp, extack);
+ if (err)
+ goto release_idr;
}
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
@@ -178,52 +198,34 @@ nla_put_failure:
return -1;
}
-static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, simp_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, simp_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
static struct tc_action_ops act_simp_ops = {
.kind = "simple",
- .type = TCA_ACT_SIMP,
+ .id = TCA_ID_SIMP,
.owner = THIS_MODULE,
.act = tcf_simp_act,
.dump = tcf_simp_dump,
.cleanup = tcf_simp_release,
.init = tcf_simp_init,
- .walk = tcf_simp_walker,
- .lookup = tcf_simp_search,
.size = sizeof(struct tcf_defact),
};
+MODULE_ALIAS_NET_ACT("simple");
static __net_init int simp_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, simp_net_id);
+ struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id);
- return tc_action_net_init(tn, &act_simp_ops);
+ return tc_action_net_init(net, tn, &act_simp_ops);
}
static void __net_exit simp_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, simp_net_id);
+ tc_action_net_exit(net_list, act_simp_ops.net_id);
}
static struct pernet_operations simp_net_ops = {
.init = simp_init_net,
.exit_batch = simp_exit_net,
- .id = &simp_net_id,
+ .id = &act_simp_ops.net_id,
.size = sizeof(struct tc_action_net),
};
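
Several of the converted init paths above (police, sample, simple, and the actions that follow) repeat the same goto_chain bookkeeping: tcf_action_check_ctrlact() resolves the target chain before any lock is taken, tcf_action_set_ctrlact() swaps it in under the action lock, and tcf_chain_put_by_act() releases whatever was displaced only after the lock is dropped. The rough model below shows only that ordering; the chain and action types, the mutex, and the constants are invented for illustration and do not reflect the kernel's actual locking primitives.

#include <pthread.h>
#include <stddef.h>

/* Invented stand-ins for tcf_chain / tc_action, only to show the ordering:
 * resolve the goto target before locking, swap it in under the lock, and
 * drop the displaced reference only after the lock is released.
 */
struct toy_chain  { int refcnt; int index; };
struct toy_action { pthread_mutex_t lock; struct toy_chain *goto_chain; int verdict; };

#define TOY_ACT_GOTO_CHAIN 0x20000000
#define TOY_CHAINS 4

static struct toy_chain toy_chains[TOY_CHAINS];

static struct toy_chain *toy_chain_get(int index)      /* like tcf_chain_get_by_act() */
{
	if (index < 0 || index >= TOY_CHAINS)
		return NULL;
	toy_chains[index].index = index;
	toy_chains[index].refcnt++;
	return &toy_chains[index];
}

static void toy_chain_put(struct toy_chain *chain)     /* like tcf_chain_put_by_act() */
{
	chain->refcnt--;
}

static int toy_set_ctrl_action(struct toy_action *a, int verdict)
{
	struct toy_chain *new_chain = NULL, *old_chain;

	if (verdict & TOY_ACT_GOTO_CHAIN) {
		new_chain = toy_chain_get(verdict & 0xffff);
		if (!new_chain)
			return -1;                      /* target chain does not exist */
	}

	pthread_mutex_lock(&a->lock);
	a->verdict = verdict;                           /* tcf_action_set_ctrlact() step */
	old_chain = a->goto_chain;                      /* whatever was there before */
	a->goto_chain = new_chain;
	pthread_mutex_unlock(&a->lock);

	if (old_chain)
		toy_chain_put(old_chain);               /* never put while holding the lock */
	return 0;
}

int main(void)
{
	struct toy_action a = { .lock = PTHREAD_MUTEX_INITIALIZER };

	toy_set_ctrl_action(&a, TOY_ACT_GOTO_CHAIN | 2);    /* goto chain 2 */
	toy_set_ctrl_action(&a, 0);                         /* back to a plain verdict */
	return toy_chains[2].refcnt;                        /* should be back to 0 */
}
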
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 64dba3708fce..8c1d1554f657 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
@@ -26,32 +15,46 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/dsfield.h>
+#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
#include <linux/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_skbedit.h>
-static unsigned int skbedit_net_id;
static struct tc_action_ops act_skbedit_ops;
-static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params,
+ struct sk_buff *skb)
+{
+ u16 queue_mapping = params->queue_mapping;
+
+ if (params->flags & SKBEDIT_F_TXQ_SKBHASH) {
+ u32 hash = skb_get_hash(skb);
+
+ queue_mapping += hash % params->mapping_mod;
+ }
+
+ return netdev_cap_txqueue(skb->dev, queue_mapping);
+}
+
+TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_skbedit *d = to_skbedit(a);
struct tcf_skbedit_params *params;
- int action;
tcf_lastuse_update(&d->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb);
params = rcu_dereference_bh(d->params);
- action = READ_ONCE(d->tcf_action);
if (params->flags & SKBEDIT_F_PRIORITY)
skb->priority = params->priority;
if (params->flags & SKBEDIT_F_INHERITDSFIELD) {
int wlen = skb_network_offset(skb);
- switch (tc_skb_protocol(skb)) {
+ switch (skb_protocol(skb, true)) {
case htons(ETH_P_IP):
wlen += sizeof(struct iphdr);
if (!pskb_may_pull(skb, wlen))
@@ -68,21 +71,36 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a,
}
}
if (params->flags & SKBEDIT_F_QUEUE_MAPPING &&
- skb->dev->real_num_tx_queues > params->queue_mapping)
- skb_set_queue_mapping(skb, params->queue_mapping);
+ skb->dev->real_num_tx_queues > params->queue_mapping) {
+#ifdef CONFIG_NET_EGRESS
+ netdev_xmit_skip_txqueue(true);
+#endif
+ skb_set_queue_mapping(skb, tcf_skbedit_hash(params, skb));
+ }
if (params->flags & SKBEDIT_F_MARK) {
skb->mark &= ~params->mask;
skb->mark |= params->mark & params->mask;
}
if (params->flags & SKBEDIT_F_PTYPE)
skb->pkt_type = params->ptype;
- return action;
+ return params->action;
err:
qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
return TC_ACT_SHOT;
}
+static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes,
+ u64 packets, u64 drops,
+ u64 lastuse, bool hw)
+{
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
@@ -91,27 +109,33 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
[TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) },
+ [TCA_SKBEDIT_QUEUE_MAPPING_MAX] = { .len = sizeof(u16) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+ struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id);
+ bool bind = act_flags & TCA_ACT_FLAGS_BIND;
struct tcf_skbedit_params *params_new;
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_skbedit *parm;
struct tcf_skbedit *d;
u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
u16 *queue_mapping = NULL, *ptype = NULL;
+ u16 mapping_mod = 1;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_SKBEDIT_MAX, nla,
+ skbedit_policy, NULL);
if (err < 0)
return err;
@@ -124,6 +148,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
}
if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
+ if (is_tcf_skbedit_ingress(act_flags) &&
+ !(act_flags & TCA_ACT_FLAGS_SKIP_SW)) {
+ NL_SET_ERR_MSG_MOD(extack, "\"queue_mapping\" option on receive side is hardware only, use skip_sw");
+ return -EOPNOTSUPP;
+ }
flags |= SKBEDIT_F_QUEUE_MAPPING;
queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
}
@@ -148,32 +177,51 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+ if (*pure_flags & SKBEDIT_F_TXQ_SKBHASH) {
+ u16 *queue_mapping_max;
+
+ if (!tb[TCA_SKBEDIT_QUEUE_MAPPING] ||
+ !tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required range of queue_mapping.");
+ return -EINVAL;
+ }
+
+ queue_mapping_max =
+ nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]);
+ if (*queue_mapping_max < *queue_mapping) {
+ NL_SET_ERR_MSG_MOD(extack, "The range of queue_mapping is invalid, max < min.");
+ return -EINVAL;
+ }
+
+ mapping_mod = *queue_mapping_max - *queue_mapping + 1;
+ flags |= SKBEDIT_F_TXQ_SKBHASH;
+ }
if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
flags |= SKBEDIT_F_INHERITDSFIELD;
}
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
if (!flags) {
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_skbedit_ops, bind, true);
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_skbedit_ops, bind, true, act_flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -181,24 +229,28 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
ret = ACT_P_CREATED;
} else {
d = to_skbedit(*a);
- if (!ovr) {
+ if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
params_new->flags = flags;
if (flags & SKBEDIT_F_PRIORITY)
params_new->priority = *priority;
- if (flags & SKBEDIT_F_QUEUE_MAPPING)
+ if (flags & SKBEDIT_F_QUEUE_MAPPING) {
params_new->queue_mapping = *queue_mapping;
+ params_new->mapping_mod = mapping_mod;
+ }
if (flags & SKBEDIT_F_MARK)
params_new->mark = *mark;
if (flags & SKBEDIT_F_PTYPE)
@@ -208,25 +260,32 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
if (flags & SKBEDIT_F_MASK)
params_new->mask = *mask;
+ params_new->action = parm->action;
spin_lock_bh(&d->tcf_lock);
- d->tcf_action = parm->action;
- rcu_swap_protected(d->params, params_new,
- lockdep_is_held(&d->tcf_lock));
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ params_new = rcu_replace_pointer(d->params, params_new,
+ lockdep_is_held(&d->tcf_lock));
spin_unlock_bh(&d->tcf_lock);
if (params_new)
kfree_rcu(params_new, rcu);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
+ const struct tcf_skbedit *d = to_skbedit(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_skbedit *d = to_skbedit(a);
- struct tcf_skbedit_params *params;
+ const struct tcf_skbedit_params *params;
struct tc_skbedit opt = {
.index = d->tcf_index,
.refcnt = refcount_read(&d->tcf_refcnt) - ref,
@@ -235,10 +294,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
u64 pure_flags = 0;
struct tcf_t t;
- spin_lock_bh(&d->tcf_lock);
- params = rcu_dereference_protected(d->params,
- lockdep_is_held(&d->tcf_lock));
- opt.action = d->tcf_action;
+ rcu_read_lock();
+ params = rcu_dereference(d->params);
+ opt.action = params->action;
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -259,6 +317,13 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
goto nla_put_failure;
if (params->flags & SKBEDIT_F_INHERITDSFIELD)
pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+ if (params->flags & SKBEDIT_F_TXQ_SKBHASH) {
+ if (nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING_MAX,
+ params->queue_mapping + params->mapping_mod - 1))
+ goto nla_put_failure;
+
+ pure_flags |= SKBEDIT_F_TXQ_SKBHASH;
+ }
if (pure_flags != 0 &&
nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
goto nla_put_failure;
@@ -266,12 +331,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
goto nla_put_failure;
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
@@ -286,52 +351,97 @@ static void tcf_skbedit_cleanup(struct tc_action *a)
kfree_rcu(params, rcu);
}
-static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
+static size_t tcf_skbedit_get_fill_size(const struct tc_action *act)
{
- struct tc_action_net *tn = net_generic(net, skbedit_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+ return nla_total_size(sizeof(struct tc_skbedit))
+ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */
+ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */
+ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING_MAX */
+ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */
+ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */
+ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */
+ + nla_total_size_64bit(sizeof(u64)); /* TCA_SKBEDIT_FLAGS */
}
-static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ if (is_tcf_skbedit_mark(act)) {
+ entry->id = FLOW_ACTION_MARK;
+ entry->mark = tcf_skbedit_mark(act);
+ } else if (is_tcf_skbedit_ptype(act)) {
+ entry->id = FLOW_ACTION_PTYPE;
+ entry->ptype = tcf_skbedit_ptype(act);
+ } else if (is_tcf_skbedit_priority(act)) {
+ entry->id = FLOW_ACTION_PRIORITY;
+ entry->priority = tcf_skbedit_priority(act);
+ } else if (is_tcf_skbedit_tx_queue_mapping(act)) {
+ NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used on transmit side");
+ return -EOPNOTSUPP;
+ } else if (is_tcf_skbedit_rx_queue_mapping(act)) {
+ entry->id = FLOW_ACTION_RX_QUEUE_MAPPING;
+ entry->rx_queue = tcf_skbedit_rx_queue_mapping(act);
+ } else if (is_tcf_skbedit_inheritdsfield(act)) {
+ NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"inheritdsfield\" option is used");
+ return -EOPNOTSUPP;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported skbedit option offload");
+ return -EOPNOTSUPP;
+ }
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ if (is_tcf_skbedit_mark(act))
+ fl_action->id = FLOW_ACTION_MARK;
+ else if (is_tcf_skbedit_ptype(act))
+ fl_action->id = FLOW_ACTION_PTYPE;
+ else if (is_tcf_skbedit_priority(act))
+ fl_action->id = FLOW_ACTION_PRIORITY;
+ else if (is_tcf_skbedit_rx_queue_mapping(act))
+ fl_action->id = FLOW_ACTION_RX_QUEUE_MAPPING;
+ else
+ return -EOPNOTSUPP;
+ }
- return tcf_idr_search(tn, a, index);
+ return 0;
}
static struct tc_action_ops act_skbedit_ops = {
.kind = "skbedit",
- .type = TCA_ACT_SKBEDIT,
+ .id = TCA_ID_SKBEDIT,
.owner = THIS_MODULE,
.act = tcf_skbedit_act,
+ .stats_update = tcf_skbedit_stats_update,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
.cleanup = tcf_skbedit_cleanup,
- .walk = tcf_skbedit_walker,
- .lookup = tcf_skbedit_search,
+ .get_fill_size = tcf_skbedit_get_fill_size,
+ .offload_act_setup = tcf_skbedit_offload_act_setup,
.size = sizeof(struct tcf_skbedit),
};
+MODULE_ALIAS_NET_ACT("skbedit");
static __net_init int skbedit_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+ struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id);
- return tc_action_net_init(tn, &act_skbedit_ops);
+ return tc_action_net_init(net, tn, &act_skbedit_ops);
}
static void __net_exit skbedit_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, skbedit_net_id);
+ tc_action_net_exit(net_list, act_skbedit_ops.net_id);
}
static struct pernet_operations skbedit_net_ops = {
.init = skbedit_init_net,
.exit_batch = skbedit_exit_net,
- .id = &skbedit_net_id,
+ .id = &act_skbedit_ops.net_id,
.size = sizeof(struct tc_action_net),
};
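
The skbedit changes above introduce SKBEDIT_F_TXQ_SKBHASH: instead of pinning every packet to one queue, tcf_skbedit_hash() spreads flows across the configured [queue_mapping, queue_mapping_max] range by flow hash, with mapping_mod = max - min + 1, and then caps the result to the device's real queue count. A compact model of that arithmetic (function and parameter names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Model of tcf_skbedit_hash() above when SKBEDIT_F_TXQ_SKBHASH is set:
 * start at the configured base queue, add (flow hash modulo the size of the
 * [min, max] range), then cap to the device queue count the way
 * netdev_cap_txqueue() does.
 */
static uint16_t toy_pick_txq(uint32_t flow_hash, uint16_t q_min, uint16_t q_max,
			     uint16_t real_num_tx_queues)
{
	uint16_t mapping_mod = q_max - q_min + 1;    /* size of the configured range */
	uint32_t q = q_min + flow_hash % mapping_mod;

	if (q >= real_num_tx_queues)                 /* out-of-range queues wrap */
		q %= real_num_tx_queues;
	return (uint16_t)q;
}

int main(void)
{
	/* Spread eight flow hashes across queues 2..5 of an 8-queue device. */
	for (uint32_t h = 0; h < 8; h++)
		printf("hash %u -> txq %u\n", h, toy_pick_txq(h, 2, 5, 8));
	return 0;
}

With q_min = 2 and q_max = 5 the hashes land on queues 2, 3, 4 and 5 in turn, which is the distribution the TCA_SKBEDIT_QUEUE_MAPPING / TCA_SKBEDIT_QUEUE_MAPPING_MAX pair configures.
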
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 59710a183bd3..a9e0c1326e2a 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -1,55 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_skbmod.c skb data modifier
*
* Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
+#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+#include <net/inet_ecn.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
#include <linux/tc_act/tc_skbmod.h>
#include <net/tc_act/tc_skbmod.h>
-static unsigned int skbmod_net_id;
static struct tc_action_ops act_skbmod_ops;
-#define MAX_EDIT_LEN ETH_HLEN
-static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_skbmod *d = to_skbmod(a);
- int action;
struct tcf_skbmod_params *p;
+ int max_edit_len, err;
u64 flags;
- int err;
tcf_lastuse_update(&d->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb);
- /* XXX: if you are going to edit more fields beyond ethernet header
- * (example when you add IP header replacement or vlan swap)
- * then MAX_EDIT_LEN needs to change appropriately
- */
- err = skb_ensure_writable(skb, MAX_EDIT_LEN);
- if (unlikely(err)) /* best policy is to drop on the floor */
+ p = rcu_dereference_bh(d->skbmod_p);
+ if (unlikely(p->action == TC_ACT_SHOT))
goto drop;
- action = READ_ONCE(d->tcf_action);
- if (unlikely(action == TC_ACT_SHOT))
+ max_edit_len = skb_mac_header_len(skb);
+ flags = p->flags;
+
+ /* tcf_skbmod_init() guarantees "flags" to be one of the following:
+ * 1. a combination of SKBMOD_F_{DMAC,SMAC,ETYPE}
+ * 2. SKBMOD_F_SWAPMAC
+ * 3. SKBMOD_F_ECN
+ * SKBMOD_F_ECN only works with IP packets; all other flags only work with Ethernet
+ * packets.
+ */
+ if (flags == SKBMOD_F_ECN) {
+ switch (skb_protocol(skb, true)) {
+ case cpu_to_be16(ETH_P_IP):
+ case cpu_to_be16(ETH_P_IPV6):
+ max_edit_len += skb_network_header_len(skb);
+ break;
+ default:
+ goto out;
+ }
+ } else if (!skb->dev || skb->dev->type != ARPHRD_ETHER) {
+ goto out;
+ }
+
+ err = skb_ensure_writable(skb, max_edit_len);
+ if (unlikely(err)) /* best policy is to drop on the floor */
goto drop;
- p = rcu_dereference_bh(d->skbmod_p);
- flags = p->flags;
if (flags & SKBMOD_F_DMAC)
ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
if (flags & SKBMOD_F_SMAC)
@@ -65,7 +80,11 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr);
}
- return action;
+ if (flags & SKBMOD_F_ECN)
+ INET_ECN_set_ce(skb);
+
+out:
+ return p->action;
drop:
qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
@@ -81,25 +100,29 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+ struct tc_action_net *tn = net_generic(net, act_skbmod_ops.net_id);
+ bool ovr = flags & TCA_ACT_FLAGS_REPLACE;
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_SKBMOD_MAX + 1];
struct tcf_skbmod_params *p, *p_old;
+ struct tcf_chain *goto_ch = NULL;
struct tc_skbmod *parm;
+ u32 lflags = 0, index;
struct tcf_skbmod *d;
bool exists = false;
u8 *daddr = NULL;
u8 *saddr = NULL;
u16 eth_type = 0;
- u32 lflags = 0;
int ret = 0, err;
if (!nla)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_SKBMOD_MAX, nla,
+ skbmod_policy, NULL);
if (err < 0)
return err;
@@ -122,29 +145,32 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+ index = parm->index;
if (parm->flags & SKBMOD_F_SWAPMAC)
lflags = SKBMOD_F_SWAPMAC;
+ if (parm->flags & SKBMOD_F_ECN)
+ lflags = SKBMOD_F_ECN;
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
if (!lflags) {
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_skbmod_ops, bind, true);
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_skbmod_ops, bind, true, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -153,21 +179,24 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
d = to_skbmod(*a);
p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
if (unlikely(!p)) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
p->flags = lflags;
- d->tcf_action = parm->action;
-
+ p->action = parm->action;
if (ovr)
spin_lock_bh(&d->tcf_lock);
/* Protected by tcf_lock if overwriting existing action. */
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
p_old = rcu_dereference_protected(d->skbmod_p, 1);
if (lflags & SKBMOD_F_DMAC)
@@ -183,10 +212,16 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
if (p_old)
kfree_rcu(p_old, rcu);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static void tcf_skbmod_cleanup(struct tc_action *a)
@@ -205,17 +240,16 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_skbmod *d = to_skbmod(a);
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbmod_params *p;
- struct tc_skbmod opt = {
- .index = d->tcf_index,
- .refcnt = refcount_read(&d->tcf_refcnt) - ref,
- .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
- };
+ struct tc_skbmod opt;
struct tcf_t t;
- spin_lock_bh(&d->tcf_lock);
- opt.action = d->tcf_action;
- p = rcu_dereference_protected(d->skbmod_p,
- lockdep_is_held(&d->tcf_lock));
+ memset(&opt, 0, sizeof(opt));
+ opt.index = d->tcf_index;
+ opt.refcnt = refcount_read(&d->tcf_refcnt) - ref;
+ opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind;
+ rcu_read_lock();
+ p = rcu_dereference(d->skbmod_p);
+ opt.action = p->action;
opt.flags = p->flags;
if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -233,60 +267,42 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
goto nla_put_failure;
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
-static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
-{
- struct tc_action_net *tn = net_generic(net, skbmod_net_id);
-
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
-}
-
-static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
-{
- struct tc_action_net *tn = net_generic(net, skbmod_net_id);
-
- return tcf_idr_search(tn, a, index);
-}
-
static struct tc_action_ops act_skbmod_ops = {
.kind = "skbmod",
- .type = TCA_ACT_SKBMOD,
+ .id = TCA_ACT_SKBMOD,
.owner = THIS_MODULE,
.act = tcf_skbmod_act,
.dump = tcf_skbmod_dump,
.init = tcf_skbmod_init,
.cleanup = tcf_skbmod_cleanup,
- .walk = tcf_skbmod_walker,
- .lookup = tcf_skbmod_search,
.size = sizeof(struct tcf_skbmod),
};
+MODULE_ALIAS_NET_ACT("skbmod");
static __net_init int skbmod_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+ struct tc_action_net *tn = net_generic(net, act_skbmod_ops.net_id);
- return tc_action_net_init(tn, &act_skbmod_ops);
+ return tc_action_net_init(net, tn, &act_skbmod_ops);
}
static void __net_exit skbmod_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, skbmod_net_id);
+ tc_action_net_exit(net_list, act_skbmod_ops.net_id);
}
static struct pernet_operations skbmod_net_ops = {
.init = skbmod_init_net,
.exit_batch = skbmod_exit_net,
- .id = &skbmod_net_id,
+ .id = &act_skbmod_ops.net_id,
.size = sizeof(struct tc_action_net),
};
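
act_skbmod above replaces the fixed MAX_EDIT_LEN (ETH_HLEN) with a per-flag calculation: MAC rewrites need the Ethernet header writable on an Ethernet device, while the new SKBMOD_F_ECN flag additionally needs the network header and only applies to IPv4/IPv6 packets. The condensed model below mirrors that length selection; the packet struct and the TOY_* constants are invented for illustration.

#include <stdbool.h>
#include <stdint.h>

/* Invented packet view: just the header lengths the decision needs. */
struct toy_pkt {
	uint16_t ethertype;          /* 0x0800 = IPv4, 0x86DD = IPv6 */
	int      mac_header_len;     /* 14 for plain Ethernet */
	int      network_header_len;
	bool     dev_is_ether;
};

#define TOY_F_DMAC 0x1
#define TOY_F_SMAC 0x2
#define TOY_F_ECN  0x8

/* Returns how many leading bytes must be made writable before editing, or -1
 * when the configured flags do not apply to this packet (the kernel then just
 * returns the configured action without touching the skb).
 */
static int toy_skbmod_edit_len(const struct toy_pkt *pkt, unsigned int flags)
{
	int len = pkt->mac_header_len;

	if (flags == TOY_F_ECN) {
		if (pkt->ethertype != 0x0800 && pkt->ethertype != 0x86DD)
			return -1;                   /* ECN marking is IP-only */
		return len + pkt->network_header_len;
	}

	if (!pkt->dev_is_ether)
		return -1;                           /* MAC rewrites need Ethernet */
	return len;
}

int main(void)
{
	struct toy_pkt ip_pkt = { .ethertype = 0x0800, .mac_header_len = 14,
				  .network_header_len = 20, .dev_is_ether = true };

	return toy_skbmod_edit_len(&ip_pkt, TOY_F_ECN) == 34 ? 0 : 1;
}
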
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 8b43fe0130f7..876b30c5709e 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2016, Amir Vadai <amir@vadai.me>
* Copyright (c) 2016, Mellanox Technologies. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -14,28 +10,30 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
#include <linux/tc_act/tc_tunnel_key.h>
#include <net/tc_act/tc_tunnel_key.h>
-static unsigned int tunnel_key_net_id;
static struct tc_action_ops act_tunnel_key_ops;
-static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_tunnel_key *t = to_tunnel_key(a);
struct tcf_tunnel_key_params *params;
- int action;
params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
- action = READ_ONCE(t->tcf_action);
+ tcf_action_update_bstats(&t->common, skb);
switch (params->tcft_action) {
case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -51,12 +49,16 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
break;
}
- return action;
+ return params->action;
}
static const struct nla_policy
enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN },
[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -64,7 +66,20 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
- .len = 128 },
+ .len = 127 },
+};
+
+static const struct nla_policy
+vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
};
static int
@@ -75,8 +90,9 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
int err, data_len, opt_len;
u8 *data;
- err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
- nla, geneve_opt_policy, extack);
+ err = nla_parse_nested_deprecated(tb,
+ TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
if (err < 0)
return err;
@@ -118,29 +134,142 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
return opt_len;
}
+static int
+tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct vxlan_metadata *md = dst;
+
+ md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]);
+ md->gbp &= VXLAN_GBP_MASK;
+ }
+
+ return sizeof(struct vxlan_metadata);
+}
+
+static int
+tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ int err;
+ u8 ver;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ } else if (ver == 2) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct erspan_metadata *md = dst;
+
+ md->version = ver;
+ if (ver == 1) {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(nla);
+ } else {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ }
+
+ return sizeof(struct erspan_metadata);
+}
+
static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
int dst_len, struct netlink_ext_ack *extack)
{
- int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0;
const struct nlattr *attr, *head = nla_data(nla);
- err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
- enc_opts_policy, extack);
+ err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
if (err)
return err;
nla_for_each_attr(attr, head, len, rem) {
switch (nla_type(attr)) {
case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
opt_len = tunnel_key_copy_geneve_opt(attr, dst,
dst_len, extack);
if (opt_len < 0)
return opt_len;
opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -EINVAL;
+ }
if (dst) {
dst_len -= opt_len;
dst += opt_len;
}
+ type = IP_TUNNEL_GENEVE_OPT_BIT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_vxlan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = IP_TUNNEL_VXLAN_OPT_BIT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_erspan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = IP_TUNNEL_ERSPAN_OPT_BIT;
break;
}
}
@@ -171,7 +300,23 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
switch (nla_type(nla_data(nla))) {
case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
#if IS_ENABLED(CONFIG_INET)
- info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+ __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+#if IS_ENABLED(CONFIG_INET)
+ __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+#if IS_ENABLED(CONFIG_INET)
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
opts_len, extack);
#else
@@ -203,27 +348,31 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
return;
if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
dst_release(&p->tcft_enc_metadata->dst);
+
kfree_rcu(p, rcu);
}
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+ struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id);
+ bool bind = act_flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
struct tcf_tunnel_key_params *params_new;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
struct metadata_dst *metadata = NULL;
+ struct tcf_chain *goto_ch = NULL;
struct tc_tunnel_key *parm;
struct tcf_tunnel_key *t;
bool exists = false;
__be16 dst_port = 0;
__be64 key_id = 0;
int opts_len = 0;
- __be16 flags = 0;
u8 tos, ttl;
int ret = 0;
+ u32 index;
int err;
if (!nla) {
@@ -231,8 +380,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy,
- extack);
+ err = nla_parse_nested_deprecated(tb, TCA_TUNNEL_KEY_MAX, nla,
+ tunnel_key_policy, extack);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes");
return err;
@@ -244,12 +393,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
switch (parm->t_action) {
case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -260,13 +410,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]);
key_id = key32_to_tunnel_id(key32);
- flags = TUNNEL_KEY;
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
}
- flags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
if (tb[TCA_TUNNEL_KEY_NO_CSUM] &&
nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
- flags &= ~TUNNEL_CSUM;
+ __clear_bit(IP_TUNNEL_CSUM_BIT, flags);
+
+ if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG]))
+ __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, flags);
if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
@@ -308,7 +461,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
0, flags,
- key_id, 0);
+ key_id, opts_len);
} else {
NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst");
ret = -EINVAL;
@@ -321,6 +474,12 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
goto err_out;
}
+#ifdef CONFIG_DST_CACHE
+ ret = dst_cache_init(&metadata->u.tun_info.dst_cache, GFP_KERNEL);
+ if (ret)
+ goto release_tun_meta;
+#endif
+
if (opts_len) {
ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
&metadata->u.tun_info,
@@ -338,20 +497,27 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_tunnel_key_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_tunnel_key_ops, bind,
+ act_flags);
if (ret) {
NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
goto release_tun_meta;
}
ret = ACT_P_CREATED;
- } else if (!ovr) {
+ } else if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) {
NL_SET_ERR_MSG(extack, "TC IDR already exists");
ret = -EEXIST;
goto release_tun_meta;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0) {
+ ret = err;
+ exists = true;
+ goto release_tun_meta;
+ }
t = to_tunnel_key(*a);
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
@@ -359,31 +525,36 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
ret = -ENOMEM;
exists = true;
- goto release_tun_meta;
+ goto put_chain;
}
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
+ params_new->action = parm->action;
spin_lock_bh(&t->tcf_lock);
- t->tcf_action = parm->action;
- rcu_swap_protected(t->params, params_new,
- lockdep_is_held(&t->tcf_lock));
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ params_new = rcu_replace_pointer(t->params, params_new,
+ lockdep_is_held(&t->tcf_lock));
spin_unlock_bh(&t->tcf_lock);
tunnel_key_release_params(params_new);
-
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
release_tun_meta:
- dst_release(&metadata->dst);
+ if (metadata)
+ dst_release(&metadata->dst);
err_out:
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -399,16 +570,16 @@ static void tunnel_key_release(struct tc_action *a)
static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
+ const u8 *src = ip_tunnel_info_opts(info);
int len = info->options_len;
- u8 *src = (u8 *)(info + 1);
struct nlattr *start;
- start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
if (!start)
return -EMSGSIZE;
while (len > 0) {
- struct geneve_opt *opt = (struct geneve_opt *)src;
+ const struct geneve_opt *opt = (const struct geneve_opt *)src;
if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
opt->opt_class) ||
@@ -428,6 +599,56 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
return 0;
}
+static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ const struct vxlan_metadata *md = ip_tunnel_info_opts(info);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) {
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_erspan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ const struct erspan_metadata *md = ip_tunnel_info_opts(info);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto err;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto err;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto err;
+
+ nla_nest_end(skb, start);
+ return 0;
+err:
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+}
+
static int tunnel_key_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
@@ -437,14 +658,22 @@ static int tunnel_key_opts_dump(struct sk_buff *skb,
if (!info->options_len)
return 0;
- start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS);
if (!start)
return -EMSGSIZE;
- if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+ if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
err = tunnel_key_geneve_opts_dump(skb, info);
if (err)
goto err_out;
+ } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
+ err = tunnel_key_vxlan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
+ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
+ err = tunnel_key_erspan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
} else {
err_out:
nla_nest_cancel(skb, start);
@@ -496,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t tm;
- spin_lock_bh(&t->tcf_lock);
- params = rcu_dereference_protected(t->params,
- lockdep_is_held(&t->tcf_lock));
- opt.action = t->tcf_action;
+ rcu_read_lock();
+ params = rcu_dereference(t->params);
+ opt.action = params->action;
opt.t_action = params->tcft_action;
if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
@@ -511,7 +739,7 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
struct ip_tunnel_key *key = &info->key;
__be32 key_id = tunnel_id_to_key32(key->tun_id);
- if (((key->tun_flags & TUNNEL_KEY) &&
+ if ((test_bit(IP_TUNNEL_KEY_BIT, key->tun_flags) &&
nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) ||
tunnel_key_dump_addresses(skb,
&params->tcft_enc_metadata->u.tun_info) ||
@@ -519,7 +747,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT,
key->tp_dst)) ||
nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
- !(key->tun_flags & TUNNEL_CSUM)) ||
+ !test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) ||
+ (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) &&
+ nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) ||
tunnel_key_opts_dump(skb, info))
goto nla_put_failure;
@@ -534,62 +764,100 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
&tm, TCA_TUNNEL_KEY_PAD))
goto nla_put_failure;
- spin_unlock_bh(&t->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&t->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
-static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
+static void tcf_tunnel_encap_put_tunnel(void *priv)
{
- struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+ struct ip_tunnel_info *tunnel = priv;
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+ kfree(tunnel);
+}
+
+static int tcf_tunnel_encap_get_tunnel(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ entry->tunnel = tcf_tunnel_info_copy(act);
+ if (!entry->tunnel)
+ return -ENOMEM;
+ entry->destructor = tcf_tunnel_encap_put_tunnel;
+ entry->destructor_priv = entry->tunnel;
+ return 0;
}
-static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_tunnel_key_offload_act_setup(struct tc_action *act,
+ void *entry_data,
+ u32 *index_inc,
+ bool bind,
+ struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+ int err;
+
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ if (is_tcf_tunnel_set(act)) {
+ entry->id = FLOW_ACTION_TUNNEL_ENCAP;
+ err = tcf_tunnel_encap_get_tunnel(entry, act);
+ if (err)
+ return err;
+ } else if (is_tcf_tunnel_release(act)) {
+ entry->id = FLOW_ACTION_TUNNEL_DECAP;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel key mode offload");
+ return -EOPNOTSUPP;
+ }
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ if (is_tcf_tunnel_set(act))
+ fl_action->id = FLOW_ACTION_TUNNEL_ENCAP;
+ else if (is_tcf_tunnel_release(act))
+ fl_action->id = FLOW_ACTION_TUNNEL_DECAP;
+ else
+ return -EOPNOTSUPP;
+ }
- return tcf_idr_search(tn, a, index);
+ return 0;
}
static struct tc_action_ops act_tunnel_key_ops = {
.kind = "tunnel_key",
- .type = TCA_ACT_TUNNEL_KEY,
+ .id = TCA_ID_TUNNEL_KEY,
.owner = THIS_MODULE,
.act = tunnel_key_act,
.dump = tunnel_key_dump,
.init = tunnel_key_init,
.cleanup = tunnel_key_release,
- .walk = tunnel_key_walker,
- .lookup = tunnel_key_search,
+ .offload_act_setup = tcf_tunnel_key_offload_act_setup,
.size = sizeof(struct tcf_tunnel_key),
};
+MODULE_ALIAS_NET_ACT("tunnel_key");
static __net_init int tunnel_key_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+ struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id);
- return tc_action_net_init(tn, &act_tunnel_key_ops);
+ return tc_action_net_init(net, tn, &act_tunnel_key_ops);
}
static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, tunnel_key_net_id);
+ tc_action_net_exit(net_list, act_tunnel_key_ops.net_id);
}
static struct pernet_operations tunnel_key_net_ops = {
.init = tunnel_key_init_net,
.exit_batch = tunnel_key_exit_net,
- .id = &tunnel_key_net_id,
+ .id = &act_tunnel_key_ops.net_id,
.size = sizeof(struct tc_action_net),
};
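
The act_tunnel_key hunks above also track the conversion of tunnel flags from a __be16 bitmask (TUNNEL_KEY, TUNNEL_CSUM) to a bitmap declared with IP_TUNNEL_DECLARE_FLAGS() and manipulated with __set_bit()/__clear_bit()/test_bit(). A minimal sketch of that pattern, assuming the macro and bit names from include/net/ip_tunnels.h; the helper below is hypothetical and only illustrates the idiom used in tunnel_key_init() above:

#include <linux/bitops.h>
#include <net/ip_tunnels.h>

/* Hypothetical helper: callers declare the bitmap with
 * IP_TUNNEL_DECLARE_FLAGS(flags) = { }; which replaces "__be16 flags = 0".
 */
static void example_build_tun_flags(unsigned long *flags, bool has_key,
				    bool no_csum)
{
	if (has_key)
		__set_bit(IP_TUNNEL_KEY_BIT, flags);	/* was: flags |= TUNNEL_KEY */

	__set_bit(IP_TUNNEL_CSUM_BIT, flags);		/* was: flags |= TUNNEL_CSUM */
	if (no_csum)
		__clear_bit(IP_TUNNEL_CSUM_BIT, flags);	/* was: flags &= ~TUNNEL_CSUM */
}

The dump path moves the same way: the TUNNEL_KEY / TUNNEL_CSUM tests in tunnel_key_dump() become test_bit(IP_TUNNEL_KEY_BIT, ...) and test_bit(IP_TUNNEL_CSUM_BIT, ...), as shown in the hunks above.
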
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 93fdaf707313..a74621797d69 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -15,24 +11,25 @@
#include <linux/if_vlan.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
#include <linux/tc_act/tc_vlan.h>
#include <net/tc_act/tc_vlan.h>
-static unsigned int vlan_net_id;
static struct tc_action_ops act_vlan_ops;
-static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
+ const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_vlan *v = to_vlan(a);
struct tcf_vlan_params *p;
- int action;
int err;
u16 tci;
tcf_lastuse_update(&v->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&v->common, skb);
/* Ensure 'data' points at mac_header prior calling vlan manipulating
* functions.
@@ -40,8 +37,6 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
if (skb_at_tc_ingress(skb))
skb_push_rcsum(skb, skb->mac_len);
- action = READ_ONCE(v->tcf_action);
-
p = rcu_dereference_bh(v->vlan_p);
switch (p->tcfv_action) {
@@ -73,13 +68,23 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
/* replace the vid */
tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid;
/* replace prio bits, if tcfv_push_prio specified */
- if (p->tcfv_push_prio) {
+ if (p->tcfv_push_prio_exists) {
tci &= ~VLAN_PRIO_MASK;
tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT;
}
/* put updated tci as hwaccel tag */
__vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci);
break;
+ case TCA_VLAN_ACT_POP_ETH:
+ err = skb_eth_pop(skb);
+ if (err)
+ goto drop;
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ err = skb_eth_push(skb, p->tcfv_push_dst, p->tcfv_push_src);
+ if (err)
+ goto drop;
+ break;
default:
BUG();
}
@@ -88,27 +93,34 @@ out:
if (skb_at_tc_ingress(skb))
skb_pull_rcsum(skb, skb->mac_len);
- return action;
+ skb_reset_mac_len(skb);
+ return p->action;
drop:
- qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&v->common);
return TC_ACT_SHOT;
}
static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
+ [TCA_VLAN_UNSPEC] = { .strict_start_type = TCA_VLAN_PUSH_ETH_DST },
[TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) },
[TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 },
[TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 },
[TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 },
+ [TCA_VLAN_PUSH_ETH_DST] = NLA_POLICY_ETH_ADDR,
+ [TCA_VLAN_PUSH_ETH_SRC] = NLA_POLICY_ETH_ADDR,
};
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, vlan_net_id);
+ struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id);
+ bool bind = flags & TCA_ACT_FLAGS_BIND;
struct nlattr *tb[TCA_VLAN_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ bool push_prio_exists = false;
struct tcf_vlan_params *p;
struct tc_vlan *parm;
struct tcf_vlan *v;
@@ -118,23 +130,26 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
u8 push_prio = 0;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (!nla)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_VLAN_MAX, nla, vlan_policy,
+ NULL);
if (err < 0)
return err;
if (!tb[TCA_VLAN_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_VLAN_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
if (exists && bind)
- return 0;
+ return ACT_P_BOUND;
switch (parm->v_action) {
case TCA_VLAN_ACT_POP:
@@ -145,7 +160,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
@@ -153,7 +168,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -ERANGE;
}
@@ -167,63 +182,94 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EPROTONOSUPPORT;
}
} else {
push_proto = htons(ETH_P_8021Q);
}
- if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY])
+ push_prio_exists = !!tb[TCA_VLAN_PUSH_VLAN_PRIORITY];
+ if (push_prio_exists)
push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]);
break;
+ case TCA_VLAN_ACT_POP_ETH:
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ if (!tb[TCA_VLAN_PUSH_ETH_DST] || !tb[TCA_VLAN_PUSH_ETH_SRC]) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+ break;
default:
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
action = parm->v_action;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
- &act_vlan_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_vlan_ops, bind, flags);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
- } else if (!ovr) {
+ } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
v = to_vlan(*a);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
p->tcfv_action = action;
p->tcfv_push_vid = push_vid;
p->tcfv_push_prio = push_prio;
+ p->tcfv_push_prio_exists = push_prio_exists || action == TCA_VLAN_ACT_PUSH;
p->tcfv_push_proto = push_proto;
+ if (action == TCA_VLAN_ACT_PUSH_ETH) {
+ nla_memcpy(&p->tcfv_push_dst, tb[TCA_VLAN_PUSH_ETH_DST],
+ ETH_ALEN);
+ nla_memcpy(&p->tcfv_push_src, tb[TCA_VLAN_PUSH_ETH_SRC],
+ ETH_ALEN);
+ }
+
+ p->action = parm->action;
spin_lock_bh(&v->tcf_lock);
- v->tcf_action = parm->action;
- rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
spin_unlock_bh(&v->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (p)
kfree_rcu(p, rcu);
- if (ret == ACT_P_CREATED)
- tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static void tcf_vlan_cleanup(struct tc_action *a)
@@ -249,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&v->tcf_lock);
- opt.action = v->tcf_action;
- p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(v->vlan_p);
+ opt.action = p->action;
opt.v_action = p->tcfv_action;
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -261,69 +307,143 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
(nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) ||
nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
p->tcfv_push_proto) ||
- (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY,
- p->tcfv_push_prio))))
+ (p->tcfv_push_prio_exists &&
+ nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, p->tcfv_push_prio))))
goto nla_put_failure;
+ if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) {
+ if (nla_put(skb, TCA_VLAN_PUSH_ETH_DST, ETH_ALEN,
+ p->tcfv_push_dst))
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_VLAN_PUSH_ETH_SRC, ETH_ALEN,
+ p->tcfv_push_src))
+ goto nla_put_failure;
+ }
+
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
- spin_unlock_bh(&v->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&v->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
-static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
- struct netlink_callback *cb, int type,
- const struct tc_action_ops *ops,
- struct netlink_ext_ack *extack)
+static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
{
- struct tc_action_net *tn = net_generic(net, vlan_net_id);
+ struct tcf_vlan *v = to_vlan(a);
+ struct tcf_t *tm = &v->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
- return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+static size_t tcf_vlan_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_vlan))
+ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_ID */
+ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_PROTOCOL */
+ + nla_total_size(sizeof(u8)); /* TCA_VLAN_PUSH_VLAN_PRIORITY */
}
-static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_vlan_offload_act_setup(struct tc_action *act, void *entry_data,
+ u32 *index_inc, bool bind,
+ struct netlink_ext_ack *extack)
{
- struct tc_action_net *tn = net_generic(net, vlan_net_id);
+ if (bind) {
+ struct flow_action_entry *entry = entry_data;
+
+ switch (tcf_vlan_action(act)) {
+ case TCA_VLAN_ACT_PUSH:
+ entry->id = FLOW_ACTION_VLAN_PUSH;
+ entry->vlan.vid = tcf_vlan_push_vid(act);
+ entry->vlan.proto = tcf_vlan_push_proto(act);
+ entry->vlan.prio = tcf_vlan_push_prio(act);
+ break;
+ case TCA_VLAN_ACT_POP:
+ entry->id = FLOW_ACTION_VLAN_POP;
+ break;
+ case TCA_VLAN_ACT_MODIFY:
+ entry->id = FLOW_ACTION_VLAN_MANGLE;
+ entry->vlan.vid = tcf_vlan_push_vid(act);
+ entry->vlan.proto = tcf_vlan_push_proto(act);
+ entry->vlan.prio = tcf_vlan_push_prio(act);
+ break;
+ case TCA_VLAN_ACT_POP_ETH:
+ entry->id = FLOW_ACTION_VLAN_POP_ETH;
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ entry->id = FLOW_ACTION_VLAN_PUSH_ETH;
+ tcf_vlan_push_eth(entry->vlan_push_eth.src, entry->vlan_push_eth.dst, act);
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported vlan action mode offload");
+ return -EOPNOTSUPP;
+ }
+ *index_inc = 1;
+ } else {
+ struct flow_offload_action *fl_action = entry_data;
+
+ switch (tcf_vlan_action(act)) {
+ case TCA_VLAN_ACT_PUSH:
+ fl_action->id = FLOW_ACTION_VLAN_PUSH;
+ break;
+ case TCA_VLAN_ACT_POP:
+ fl_action->id = FLOW_ACTION_VLAN_POP;
+ break;
+ case TCA_VLAN_ACT_MODIFY:
+ fl_action->id = FLOW_ACTION_VLAN_MANGLE;
+ break;
+ case TCA_VLAN_ACT_POP_ETH:
+ fl_action->id = FLOW_ACTION_VLAN_POP_ETH;
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ fl_action->id = FLOW_ACTION_VLAN_PUSH_ETH;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
- return tcf_idr_search(tn, a, index);
+ return 0;
}
static struct tc_action_ops act_vlan_ops = {
.kind = "vlan",
- .type = TCA_ACT_VLAN,
+ .id = TCA_ID_VLAN,
.owner = THIS_MODULE,
.act = tcf_vlan_act,
.dump = tcf_vlan_dump,
.init = tcf_vlan_init,
.cleanup = tcf_vlan_cleanup,
- .walk = tcf_vlan_walker,
- .lookup = tcf_vlan_search,
+ .stats_update = tcf_vlan_stats_update,
+ .get_fill_size = tcf_vlan_get_fill_size,
+ .offload_act_setup = tcf_vlan_offload_act_setup,
.size = sizeof(struct tcf_vlan),
};
+MODULE_ALIAS_NET_ACT("vlan");
static __net_init int vlan_init_net(struct net *net)
{
- struct tc_action_net *tn = net_generic(net, vlan_net_id);
+ struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id);
- return tc_action_net_init(tn, &act_vlan_ops);
+ return tc_action_net_init(net, tn, &act_vlan_ops);
}
static void __net_exit vlan_exit_net(struct list_head *net_list)
{
- tc_action_net_exit(net_list, vlan_net_id);
+ tc_action_net_exit(net_list, act_vlan_ops.net_id);
}
static struct pernet_operations vlan_net_ops = {
.init = vlan_init_net,
.exit_batch = vlan_exit_net,
- .id = &vlan_net_id,
+ .id = &act_vlan_ops.net_id,
.size = sizeof(struct tc_action_net),
};
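
tcf_vlan_offload_act_setup() above is what translates each vlan action into a struct flow_action_entry for hardware offload. A driver-side sketch (not part of this patch) of consuming those entries, assuming flow_action_for_each() and the entry layout from include/net/flow_offload.h:

#include <net/flow_offload.h>

/* Hypothetical parser: walks the entries filled in by offload_act_setup(). */
static int example_parse_vlan_actions(const struct flow_action *flow_action)
{
	const struct flow_action_entry *act;
	int i;

	flow_action_for_each(i, act, flow_action) {
		switch (act->id) {
		case FLOW_ACTION_VLAN_PUSH:
		case FLOW_ACTION_VLAN_MANGLE:
			/* vid/proto/prio were copied from the tc action */
			pr_debug("vlan %u proto 0x%x prio %u\n",
				 act->vlan.vid, ntohs(act->vlan.proto),
				 act->vlan.prio);
			break;
		case FLOW_ACTION_VLAN_POP:
			break;
		default:
			return -EOPNOTSUPP;
		}
	}
	return 0;
}
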
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
new file mode 100644
index 000000000000..adcb618a2bfc
--- /dev/null
+++ b/net/sched/bpf_qdisc.c
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void)))
+#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void)))
+
+static struct bpf_struct_ops bpf_Qdisc_ops;
+
+struct bpf_sched_data {
+ struct qdisc_watchdog watchdog;
+};
+
+struct bpf_sk_buff_ptr {
+ struct sk_buff *skb;
+};
+
+static int bpf_qdisc_init(struct btf *btf)
+{
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr)
+
+static bool bpf_qdisc_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ struct btf *btf = prog->aux->attach_btf;
+ u32 arg;
+
+ arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off);
+ if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) {
+ if (arg == 2 && type == BPF_READ) {
+ info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ info->btf = btf;
+ info->btf_id = bpf_sk_buff_ptr_ids[0];
+ return true;
+ }
+ }
+
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, size_t *end)
+{
+ switch (off) {
+ case offsetof(struct Qdisc, limit):
+ *end = offsetofend(struct Qdisc, limit);
+ break;
+ case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen):
+ *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen);
+ break;
+ case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1:
+ *end = offsetofend(struct Qdisc, qstats);
+ break;
+ default:
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, size_t *end)
+{
+ switch (off) {
+ case offsetof(struct sk_buff, tstamp):
+ *end = offsetofend(struct sk_buff, tstamp);
+ break;
+ case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ...
+ offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb,
+ data[QDISC_CB_PRIV_LEN - 1]):
+ *end = offsetof(struct sk_buff, cb) +
+ offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]);
+ break;
+ default:
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *t, *skbt, *qdisct;
+ size_t end;
+ int err;
+
+ skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]);
+ qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+
+ if (t == skbt) {
+ err = bpf_qdisc_sk_buff_access(log, reg, off, &end);
+ } else if (t == qdisct) {
+ err = bpf_qdisc_qdisc_access(log, reg, off, &end);
+ } else {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ if (err) {
+ bpf_log(log, "no write support to %s at off %d\n",
+ btf_name_by_offset(reg->btf, t->name_off), off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of %s ended at %zu\n",
+ off, size, btf_name_by_offset(reg->btf, t->name_off), end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_qdisc_init_prologue_ids, func, bpf_qdisc_init_prologue)
+
+static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init))
+ return 0;
+
+ /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx".
+ * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack"
+ * r1 = r1[0]; // r1 will be "struct Qdisc *sch"
+ * r0 = bpf_qdisc_init_prologue(r1, r2);
+ * if r0 == 0 goto pc+1;
+ * BPF_EXIT;
+ * r1 = r6; // r1 will be "u64 *ctx".
+ */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
+}
+
+BTF_ID_LIST_SINGLE(bpf_qdisc_reset_destroy_epilogue_ids, func, bpf_qdisc_reset_destroy_epilogue)
+
+static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog,
+ s16 ctx_stack_off)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) &&
+ prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy))
+ return 0;
+
+ /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx"
+ * r1 = r1[0]; // r1 will be "struct Qdisc *sch"
+ * r0 = bpf_qdisc_reset_destroy_epilogue(r1);
+ * BPF_EXIT;
+ */
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]);
+ *insn++ = BPF_EXIT_INSN();
+
+ return insn - insn_buf;
+}
+
+__bpf_kfunc_start_defs();
+
+/* bpf_skb_get_hash - Get the flow hash of an skb.
+ * @skb: The skb to get the flow hash from.
+ */
+__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb)
+{
+ return skb_get_hash(skb);
+}
+
+/* bpf_kfree_skb - Release an skb's reference and drop it immediately.
+ * @skb: The skb whose reference is to be released and dropped.
+ */
+__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+}
+
+/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list.
+ * @skb: The skb whose reference is to be released and dropped.
+ * @to_free_list: The list of skbs to be dropped.
+ */
+__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb,
+ struct bpf_sk_buff_ptr *to_free_list)
+{
+ __qdisc_drop(skb, (struct sk_buff **)to_free_list);
+}
+
+/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer.
+ * @sch: The qdisc to be scheduled.
+ * @expire: The expiry time of the timer.
+ * @delta_ns: The slack range of the timer.
+ */
+__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns);
+}
+
+/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */
+__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch,
+ struct netlink_ext_ack *extack)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *p;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (sch->parent != TC_H_ROOT) {
+ /* If qdisc_lookup() returns NULL, it means .init is called by
+ * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc
+ * has not been added to qdisc_hash yet.
+ */
+ p = qdisc_lookup(dev, TC_H_MAJ(sch->parent));
+ if (p && !(p->flags & TCQ_F_MQROOT)) {
+ NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset
+ * and .destroy
+ */
+__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+/* bpf_qdisc_bstats_update - Update Qdisc basic statistics
+ * @sch: The qdisc from which an skb is dequeued.
+ * @skb: The skb to be dequeued.
+ */
+__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb)
+{
+ bstats_update(&sch->bstats, skb);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(qdisc_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(qdisc_kfunc_ids)
+
+BTF_SET_START(qdisc_common_kfunc_set)
+BTF_ID(func, bpf_skb_get_hash)
+BTF_ID(func, bpf_kfree_skb)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_SET_END(qdisc_common_kfunc_set)
+
+BTF_SET_START(qdisc_enqueue_kfunc_set)
+BTF_ID(func, bpf_qdisc_skb_drop)
+BTF_ID(func, bpf_qdisc_watchdog_schedule)
+BTF_SET_END(qdisc_enqueue_kfunc_set)
+
+BTF_SET_START(qdisc_dequeue_kfunc_set)
+BTF_ID(func, bpf_qdisc_watchdog_schedule)
+BTF_ID(func, bpf_qdisc_bstats_update)
+BTF_SET_END(qdisc_dequeue_kfunc_set)
+
+enum qdisc_ops_kf_flags {
+ QDISC_OPS_KF_COMMON = 0,
+ QDISC_OPS_KF_ENQUEUE = 1 << 0,
+ QDISC_OPS_KF_DEQUEUE = 1 << 1,
+};
+
+static const u32 qdisc_ops_context_flags[] = {
+ [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE,
+ [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE,
+ [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON,
+ [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON,
+ [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON,
+};
+
+static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ u32 moff, flags;
+
+ if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id))
+ return 0;
+
+ if (prog->aux->st_ops != &bpf_Qdisc_ops)
+ return -EACCES;
+
+ moff = prog->aux->attach_st_ops_member_off;
+ flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)];
+
+ if ((flags & QDISC_OPS_KF_ENQUEUE) &&
+ btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id))
+ return 0;
+
+ if ((flags & QDISC_OPS_KF_DEQUEUE) &&
+ btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id))
+ return 0;
+
+ if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id))
+ return 0;
+
+ return -EACCES;
+}
+
+static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &qdisc_kfunc_ids,
+ .filter = bpf_qdisc_kfunc_filter,
+};
+
+static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = bpf_qdisc_is_valid_access,
+ .btf_struct_access = bpf_qdisc_btf_struct_access,
+ .gen_prologue = bpf_qdisc_gen_prologue,
+ .gen_epilogue = bpf_qdisc_gen_epilogue,
+};
+
+static int bpf_qdisc_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct Qdisc_ops *uqdisc_ops;
+ struct Qdisc_ops *qdisc_ops;
+ u32 moff;
+
+ uqdisc_ops = (const struct Qdisc_ops *)udata;
+ qdisc_ops = (struct Qdisc_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct Qdisc_ops, priv_size):
+ if (uqdisc_ops->priv_size)
+ return -EINVAL;
+ qdisc_ops->priv_size = sizeof(struct bpf_sched_data);
+ return 1;
+ case offsetof(struct Qdisc_ops, peek):
+ qdisc_ops->peek = qdisc_peek_dequeued;
+ return 0;
+ case offsetof(struct Qdisc_ops, id):
+ if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id,
+ sizeof(qdisc_ops->id)) <= 0)
+ return -EINVAL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_reg(void *kdata, struct bpf_link *link)
+{
+ return register_qdisc(kdata);
+}
+
+static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link)
+{
+ return unregister_qdisc(kdata);
+}
+
+static int bpf_qdisc_validate(void *kdata)
+{
+ struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata;
+
+ if (!ops->enqueue || !ops->dequeue || !ops->init ||
+ !ops->reset || !ops->destroy)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ return 0;
+}
+
+static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void Qdisc_ops__reset(struct Qdisc *sch)
+{
+}
+
+static void Qdisc_ops__destroy(struct Qdisc *sch)
+{
+}
+
+static struct Qdisc_ops __bpf_ops_qdisc_ops = {
+ .enqueue = Qdisc_ops__enqueue,
+ .dequeue = Qdisc_ops__dequeue,
+ .init = Qdisc_ops__init,
+ .reset = Qdisc_ops__reset,
+ .destroy = Qdisc_ops__destroy,
+};
+
+static struct bpf_struct_ops bpf_Qdisc_ops = {
+ .verifier_ops = &bpf_qdisc_verifier_ops,
+ .reg = bpf_qdisc_reg,
+ .unreg = bpf_qdisc_unreg,
+ .validate = bpf_qdisc_validate,
+ .init_member = bpf_qdisc_init_member,
+ .init = bpf_qdisc_init,
+ .name = "Qdisc_ops",
+ .cfi_stubs = &__bpf_ops_qdisc_ops,
+ .owner = THIS_MODULE,
+};
+
+BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb)
+
+static int __init bpf_qdisc_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = {
+ {
+ .btf_id = bpf_sk_buff_ids[0],
+ .kfunc_btf_id = bpf_sk_buff_dtor_ids[0]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors,
+ ARRAY_SIZE(skb_kfunc_dtors),
+ THIS_MODULE);
+ ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops);
+
+ return ret;
+}
+late_initcall(bpf_qdisc_kfunc_init);
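
The struct_ops registration above is the kernel half; the qdisc itself is supplied by a BPF program attached to Qdisc_ops. A BPF-side sketch (not part of this patch) of a minimal always-drop qdisc, assuming the usual libbpf SEC()/BPF_PROG() struct_ops conventions and the kfunc prototype defined in this file; per bpf_qdisc_validate() above, .enqueue, .dequeue, .init, .reset and .destroy must all be provided:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define NET_XMIT_DROP 0x01	/* mirrors linux/netdevice.h */

void bpf_qdisc_skb_drop(struct sk_buff *skb,
			struct bpf_sk_buff_ptr *to_free) __ksym;

SEC("struct_ops/drop_enqueue")
int BPF_PROG(drop_enqueue, struct sk_buff *skb, struct Qdisc *sch,
	     struct bpf_sk_buff_ptr *to_free)
{
	bpf_qdisc_skb_drop(skb, to_free);	/* defer the kfree_skb() */
	return NET_XMIT_DROP;
}

SEC("struct_ops/drop_dequeue")
struct sk_buff *BPF_PROG(drop_dequeue, struct Qdisc *sch)
{
	return NULL;				/* nothing is ever queued */
}

SEC("struct_ops/drop_init")
int BPF_PROG(drop_init, struct Qdisc *sch, struct nlattr *opt,
	     struct netlink_ext_ack *extack)
{
	return 0;
}

SEC("struct_ops/drop_reset")
void BPF_PROG(drop_reset, struct Qdisc *sch)
{
}

SEC("struct_ops/drop_destroy")
void BPF_PROG(drop_destroy, struct Qdisc *sch)
{
}

SEC(".struct_ops.link")
struct Qdisc_ops drop_qdisc_ops = {
	.enqueue = (void *)drop_enqueue,
	.dequeue = (void *)drop_dequeue,
	.init    = (void *)drop_init,
	.reset   = (void *)drop_reset,
	.destroy = (void *)drop_destroy,
	.id      = "bpf_drop",
};

char _license[] SEC("license") = "GPL";
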
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e2b5cb2eb34e..ebca4b926dcf 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1,17 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_api.c Packet classifier API.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
*
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
- *
*/
#include <linux/module.h>
@@ -25,14 +20,28 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/idr.h>
+#include <linux/jhash.h>
+#include <linux/rculist.h>
#include <linux/rhashtable.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
-
-extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
+#include <net/tc_act/tc_pedit.h>
+#include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_vlan.h>
+#include <net/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_csum.h>
+#include <net/tc_act/tc_gact.h>
+#include <net/tc_act/tc_police.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_ct.h>
+#include <net/tc_act/tc_mpls.h>
+#include <net/tc_act/tc_gate.h>
+#include <net/flow_offload.h>
+#include <net/tc_wrapper.h>
/* The list of all installed classifier types */
static LIST_HEAD(tcf_proto_base);
@@ -40,6 +49,182 @@ static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+static struct xarray tcf_exts_miss_cookies_xa;
+struct tcf_exts_miss_cookie_node {
+ const struct tcf_chain *chain;
+ const struct tcf_proto *tp;
+ const struct tcf_exts *exts;
+ u32 chain_index;
+ u32 tp_prio;
+ u32 handle;
+ u32 miss_cookie_base;
+ struct rcu_head rcu;
+};
+
+/* Each tc action entry cookie is composed of a 32-bit miss_cookie_base plus
+ * the action index in the exts tc actions array.
+ */
+union tcf_exts_miss_cookie {
+ struct {
+ u32 miss_cookie_base;
+ u32 act_index;
+ };
+ u64 miss_cookie;
+};
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static int
+tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
+ u32 handle)
+{
+ struct tcf_exts_miss_cookie_node *n;
+ static u32 next;
+ int err;
+
+ if (WARN_ON(!handle || !tp->ops->get_exts))
+ return -EINVAL;
+
+ n = kzalloc(sizeof(*n), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ n->chain_index = tp->chain->index;
+ n->chain = tp->chain;
+ n->tp_prio = tp->prio;
+ n->tp = tp;
+ n->exts = exts;
+ n->handle = handle;
+
+ err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base,
+ n, xa_limit_32b, &next, GFP_KERNEL);
+ if (err < 0)
+ goto err_xa_alloc;
+
+ exts->miss_cookie_node = n;
+ return 0;
+
+err_xa_alloc:
+ kfree(n);
+ return err;
+}
+
+static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts)
+{
+ struct tcf_exts_miss_cookie_node *n;
+
+ if (!exts->miss_cookie_node)
+ return;
+
+ n = exts->miss_cookie_node;
+ xa_erase(&tcf_exts_miss_cookies_xa, n->miss_cookie_base);
+ kfree_rcu(n, rcu);
+}
+
+static struct tcf_exts_miss_cookie_node *
+tcf_exts_miss_cookie_lookup(u64 miss_cookie, int *act_index)
+{
+ union tcf_exts_miss_cookie mc = { .miss_cookie = miss_cookie, };
+
+ *act_index = mc.act_index;
+ return xa_load(&tcf_exts_miss_cookies_xa, mc.miss_cookie_base);
+}
+#else /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
+static int
+tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
+ u32 handle)
+{
+ return 0;
+}
+
+static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts)
+{
+}
+#endif /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
+
+static u64 tcf_exts_miss_cookie_get(u32 miss_cookie_base, int act_index)
+{
+ union tcf_exts_miss_cookie mc = { .act_index = act_index, };
+
+ if (!miss_cookie_base)
+ return 0;
+
+ mc.miss_cookie_base = miss_cookie_base;
+ return mc.miss_cookie;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc);
+EXPORT_SYMBOL(tc_skb_ext_tc);
+
+void tc_skb_ext_tc_enable(void)
+{
+ static_branch_inc(&tc_skb_ext_tc);
+}
+EXPORT_SYMBOL(tc_skb_ext_tc_enable);
+
+void tc_skb_ext_tc_disable(void)
+{
+ static_branch_dec(&tc_skb_ext_tc);
+}
+EXPORT_SYMBOL(tc_skb_ext_tc_disable);
+#endif
+
+static u32 destroy_obj_hashfn(const struct tcf_proto *tp)
+{
+ return jhash_3words(tp->chain->index, tp->prio,
+ (__force __u32)tp->protocol, 0);
+}
+
+static void tcf_proto_signal_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node,
+ destroy_obj_hashfn(tp));
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
+static bool tcf_proto_cmp(const struct tcf_proto *tp1,
+ const struct tcf_proto *tp2)
+{
+ return tp1->chain->index == tp2->chain->index &&
+ tp1->prio == tp2->prio &&
+ tp1->protocol == tp2->protocol;
+}
+
+static bool tcf_proto_exists_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ u32 hash = destroy_obj_hashfn(tp);
+ struct tcf_proto *iter;
+ bool found = false;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter,
+ destroy_ht_node, hash) {
+ if (tcf_proto_cmp(tp, iter)) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static void
+tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ if (hash_hashed(&tp->destroy_ht_node))
+ hash_del_rcu(&tp->destroy_ht_node);
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
/* Find classifier type by string name */
static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
@@ -61,7 +246,8 @@ static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
}
static const struct tcf_proto_ops *
-tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack)
+tcf_proto_lookup_ops(const char *kind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
const struct tcf_proto_ops *ops;
@@ -69,9 +255,11 @@ tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack)
if (ops)
return ops;
#ifdef CONFIG_MODULES
- rtnl_unlock();
- request_module("cls_%s", kind);
- rtnl_lock();
+ if (rtnl_held)
+ rtnl_unlock();
+ request_module(NET_CLS_ALIAS_PREFIX "%s", kind);
+ if (rtnl_held)
+ rtnl_lock();
ops = __tcf_proto_lookup_ops(kind);
/* We dropped the RTNL semaphore in order to perform
* the module load. So, even if we succeeded in loading
@@ -109,7 +297,7 @@ EXPORT_SYMBOL(register_tcf_proto_ops);
static struct workqueue_struct *tc_filter_wq;
-int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
+void unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
{
struct tcf_proto_ops *t;
int rc = -ENOENT;
@@ -129,7 +317,8 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
}
}
write_unlock(&cls_mod_lock);
- return rc;
+
+ WARN(rc, "unregister tc filter kind(%s) failed %d\n", ops->kind, rc);
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);
@@ -152,8 +341,37 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
return TC_H_MAJ(first);
}
+static bool tcf_proto_check_kind(struct nlattr *kind, char *name)
+{
+ if (kind)
+ return nla_strscpy(name, kind, IFNAMSIZ) < 0;
+ memset(name, 0, IFNAMSIZ);
+ return false;
+}
+
+static bool tcf_proto_is_unlocked(const char *kind)
+{
+ const struct tcf_proto_ops *ops;
+ bool ret;
+
+ if (strlen(kind) == 0)
+ return false;
+
+ ops = tcf_proto_lookup_ops(kind, false, NULL);
+ /* On error return false to take rtnl lock. Proto lookup/create
+ * functions will perform lookup again and properly handle errors.
+ */
+ if (IS_ERR(ops))
+ return false;
+
+ ret = !!(ops->flags & TCF_PROTO_OPS_DOIT_UNLOCKED);
+ module_put(ops->owner);
+ return ret;
+}
+
static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
u32 prio, struct tcf_chain *chain,
+ bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tcf_proto *tp;
@@ -163,7 +381,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
if (!tp)
return ERR_PTR(-ENOBUFS);
- tp->ops = tcf_proto_lookup_ops(kind, extack);
+ tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack);
if (IS_ERR(tp->ops)) {
err = PTR_ERR(tp->ops);
goto errout;
@@ -172,6 +390,9 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
tp->protocol = protocol;
tp->prio = prio;
tp->chain = chain;
+ tp->usesw = !tp->ops->reoffload;
+ spin_lock_init(&tp->lock);
+ refcount_set(&tp->refcnt, 1);
err = tp->ops->init(tp);
if (err) {
@@ -185,14 +406,89 @@ errout:
return ERR_PTR(err);
}
-static void tcf_proto_destroy(struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+static void tcf_proto_get(struct tcf_proto *tp)
{
- tp->ops->destroy(tp, extack);
+ refcount_inc(&tp->refcnt);
+}
+
+static void tcf_proto_count_usesw(struct tcf_proto *tp, bool add)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct tcf_block *block = tp->chain->block;
+ bool counted = false;
+
+ if (!add) {
+ if (tp->usesw && tp->counted) {
+ if (!atomic_dec_return(&block->useswcnt))
+ static_branch_dec(&tcf_sw_enabled_key);
+ tp->counted = false;
+ }
+ return;
+ }
+
+ spin_lock(&tp->lock);
+ if (tp->usesw && !tp->counted) {
+ counted = true;
+ tp->counted = true;
+ }
+ spin_unlock(&tp->lock);
+
+ if (counted && atomic_inc_return(&block->useswcnt) == 1)
+ static_branch_inc(&tcf_sw_enabled_key);
+#endif
+}
+
+static void tcf_chain_put(struct tcf_chain *chain);
+
+static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held,
+ bool sig_destroy, struct netlink_ext_ack *extack)
+{
+ tp->ops->destroy(tp, rtnl_held, extack);
+ tcf_proto_count_usesw(tp, false);
+ if (sig_destroy)
+ tcf_proto_signal_destroyed(tp->chain, tp);
+ tcf_chain_put(tp->chain);
module_put(tp->ops->owner);
kfree_rcu(tp, rcu);
}
+static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ if (refcount_dec_and_test(&tp->refcnt))
+ tcf_proto_destroy(tp, rtnl_held, true, extack);
+}
+
+static bool tcf_proto_check_delete(struct tcf_proto *tp)
+{
+ if (tp->ops->delete_empty)
+ return tp->ops->delete_empty(tp);
+
+ tp->deleting = true;
+ return tp->deleting;
+}
+
+static void tcf_proto_mark_delete(struct tcf_proto *tp)
+{
+ spin_lock(&tp->lock);
+ tp->deleting = true;
+ spin_unlock(&tp->lock);
+}
+
+static bool tcf_proto_is_deleting(struct tcf_proto *tp)
+{
+ bool deleting;
+
+ spin_lock(&tp->lock);
+ deleting = tp->deleting;
+ spin_unlock(&tp->lock);
+
+ return deleting;
+}
+
+#define ASSERT_BLOCK_LOCKED(block) \
+ lockdep_assert_held(&(block)->lock)
+
struct tcf_filter_chain_list_item {
struct list_head list;
tcf_chain_head_change_t *chain_head_change;
@@ -204,10 +500,13 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
{
struct tcf_chain *chain;
+ ASSERT_BLOCK_LOCKED(block);
+
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (!chain)
return NULL;
- list_add_tail(&chain->list, &block->chain_list);
+ list_add_tail_rcu(&chain->list, &block->chain_list);
+ mutex_init(&chain->filter_chain_lock);
chain->block = block;
chain->index = chain_index;
chain->refcnt = 1;
@@ -231,29 +530,61 @@ static void tcf_chain0_head_change(struct tcf_chain *chain,
if (chain->index)
return;
+
+ mutex_lock(&block->lock);
list_for_each_entry(item, &block->chain0.filter_chain_list, list)
tcf_chain_head_change_item(item, tp_head);
+ mutex_unlock(&block->lock);
}
-static void tcf_chain_destroy(struct tcf_chain *chain)
+/* Returns true if block can be safely freed. */
+
+static bool tcf_chain_detach(struct tcf_chain *chain)
{
struct tcf_block *block = chain->block;
- list_del(&chain->list);
+ ASSERT_BLOCK_LOCKED(block);
+
+ list_del_rcu(&chain->list);
if (!chain->index)
block->chain0.chain = NULL;
- kfree(chain);
- if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt))
- kfree_rcu(block, rcu);
+
+ if (list_empty(&block->chain_list) &&
+ refcount_read(&block->refcnt) == 0)
+ return true;
+
+ return false;
+}
+
+static void tcf_block_destroy(struct tcf_block *block)
+{
+ mutex_destroy(&block->lock);
+ mutex_destroy(&block->proto_destroy_lock);
+ xa_destroy(&block->ports);
+ kfree_rcu(block, rcu);
+}
+
+static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_destroy(&chain->filter_chain_lock);
+ kfree_rcu(chain, rcu);
+ if (free_block)
+ tcf_block_destroy(block);
}
static void tcf_chain_hold(struct tcf_chain *chain)
{
+ ASSERT_BLOCK_LOCKED(chain->block);
+
++chain->refcnt;
}
static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain)
{
+ ASSERT_BLOCK_LOCKED(chain->block);
+
/* In case all the references are action references, this
* chain should not be shown to the user.
*/
@@ -265,6 +596,8 @@ static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
{
struct tcf_chain *chain;
+ ASSERT_BLOCK_LOCKED(block);
+
list_for_each_entry(chain, &block->chain_list, list) {
if (chain->index == chain_index)
return chain;
@@ -272,38 +605,62 @@ static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
return NULL;
}
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block,
+ u32 chain_index)
+{
+ struct tcf_chain *chain;
+
+ list_for_each_entry_rcu(chain, &block->chain_list, list) {
+ if (chain->index == chain_index)
+ return chain;
+ }
+ return NULL;
+}
+#endif
+
static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
- u32 seq, u16 flags, int event, bool unicast);
+ u32 seq, u16 flags, int event, bool unicast,
+ struct netlink_ext_ack *extack);
static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
u32 chain_index, bool create,
bool by_act)
{
- struct tcf_chain *chain = tcf_chain_lookup(block, chain_index);
+ struct tcf_chain *chain = NULL;
+ bool is_first_reference;
+ mutex_lock(&block->lock);
+ chain = tcf_chain_lookup(block, chain_index);
if (chain) {
tcf_chain_hold(chain);
} else {
if (!create)
- return NULL;
+ goto errout;
chain = tcf_chain_create(block, chain_index);
if (!chain)
- return NULL;
+ goto errout;
}
if (by_act)
++chain->action_refcnt;
+ is_first_reference = chain->refcnt - chain->action_refcnt == 1;
+ mutex_unlock(&block->lock);
/* Send notification only in case we got the first
* non-action reference. Until then, the chain acts only as
* a placeholder for actions pointing to it and user ought
* not know about them.
*/
- if (chain->refcnt - chain->action_refcnt == 1 && !by_act)
+ if (is_first_reference && !by_act)
tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
- RTM_NEWCHAIN, false);
+ RTM_NEWCHAIN, false, NULL);
return chain;
+
+errout:
+ mutex_unlock(&block->lock);
+ return chain;
}
static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
@@ -318,311 +675,181 @@ struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index)
}
EXPORT_SYMBOL(tcf_chain_get_by_act);
-static void tc_chain_tmplt_del(struct tcf_chain *chain);
+static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv);
+static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv, u32 chain_index,
+ struct tcf_block *block, struct sk_buff *oskb,
+ u32 seq, u16 flags);
-static void __tcf_chain_put(struct tcf_chain *chain, bool by_act)
+static void __tcf_chain_put(struct tcf_chain *chain, bool by_act,
+ bool explicitly_created)
{
+ struct tcf_block *block = chain->block;
+ const struct tcf_proto_ops *tmplt_ops;
+ unsigned int refcnt, non_act_refcnt;
+ bool free_block = false;
+ void *tmplt_priv;
+
+ mutex_lock(&block->lock);
+ if (explicitly_created) {
+ if (!chain->explicitly_created) {
+ mutex_unlock(&block->lock);
+ return;
+ }
+ chain->explicitly_created = false;
+ }
+
if (by_act)
chain->action_refcnt--;
- chain->refcnt--;
- /* The last dropped non-action reference will trigger notification. */
- if (chain->refcnt - chain->action_refcnt == 0 && !by_act)
- tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false);
+ /* tc_chain_notify_delete can't be called while holding block lock.
+ * However, when block is unlocked chain can be changed concurrently, so
+ * save these to temporary variables.
+ */
+ refcnt = --chain->refcnt;
+ non_act_refcnt = refcnt - chain->action_refcnt;
+ tmplt_ops = chain->tmplt_ops;
+ tmplt_priv = chain->tmplt_priv;
+
+ if (non_act_refcnt == chain->explicitly_created && !by_act) {
+ if (non_act_refcnt == 0)
+ tc_chain_notify_delete(tmplt_ops, tmplt_priv,
+ chain->index, block, NULL, 0, 0);
+ /* Last reference to chain, no need to lock. */
+ chain->flushing = false;
+ }
- if (chain->refcnt == 0) {
- tc_chain_tmplt_del(chain);
- tcf_chain_destroy(chain);
+ if (refcnt == 0)
+ free_block = tcf_chain_detach(chain);
+ mutex_unlock(&block->lock);
+
+ if (refcnt == 0) {
+ tc_chain_tmplt_del(tmplt_ops, tmplt_priv);
+ tcf_chain_destroy(chain, free_block);
}
}
static void tcf_chain_put(struct tcf_chain *chain)
{
- __tcf_chain_put(chain, false);
+ __tcf_chain_put(chain, false, false);
}
void tcf_chain_put_by_act(struct tcf_chain *chain)
{
- __tcf_chain_put(chain, true);
+ __tcf_chain_put(chain, true, false);
}
EXPORT_SYMBOL(tcf_chain_put_by_act);
static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
{
- if (chain->explicitly_created)
- tcf_chain_put(chain);
+ __tcf_chain_put(chain, false, true);
}
-static void tcf_chain_flush(struct tcf_chain *chain)
+static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
{
- struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
+ struct tcf_proto *tp, *tp_next;
- tcf_chain0_head_change(chain, NULL);
+ mutex_lock(&chain->filter_chain_lock);
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
while (tp) {
- RCU_INIT_POINTER(chain->filter_chain, tp->next);
- tcf_proto_destroy(tp, NULL);
- tp = rtnl_dereference(chain->filter_chain);
- tcf_chain_put(chain);
+ tp_next = rcu_dereference_protected(tp->next, 1);
+ tcf_proto_signal_destroying(chain, tp);
+ tp = tp_next;
}
-}
-
-static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
-{
- const struct Qdisc_class_ops *cops;
- struct Qdisc *qdisc;
-
- if (!dev_ingress_queue(dev))
- return NULL;
-
- qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
- if (!qdisc)
- return NULL;
-
- cops = qdisc->ops->cl_ops;
- if (!cops)
- return NULL;
-
- if (!cops->tcf_block)
- return NULL;
-
- return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
-}
-
-static struct rhashtable indr_setup_block_ht;
-
-struct tc_indr_block_dev {
- struct rhash_head ht_node;
- struct net_device *dev;
- unsigned int refcnt;
- struct list_head cb_list;
- struct tcf_block *block;
-};
-
-struct tc_indr_block_cb {
- struct list_head list;
- void *cb_priv;
- tc_indr_block_bind_cb_t *cb;
- void *cb_ident;
-};
-
-static const struct rhashtable_params tc_indr_setup_block_ht_params = {
- .key_offset = offsetof(struct tc_indr_block_dev, dev),
- .head_offset = offsetof(struct tc_indr_block_dev, ht_node),
- .key_len = sizeof(struct net_device *),
-};
-
-static struct tc_indr_block_dev *
-tc_indr_block_dev_lookup(struct net_device *dev)
-{
- return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
- tc_indr_setup_block_ht_params);
-}
-
-static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
-{
- struct tc_indr_block_dev *indr_dev;
-
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (indr_dev)
- goto inc_ref;
-
- indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
- if (!indr_dev)
- return NULL;
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
+ RCU_INIT_POINTER(chain->filter_chain, NULL);
+ tcf_chain0_head_change(chain, NULL);
+ chain->flushing = true;
+ mutex_unlock(&chain->filter_chain_lock);
- INIT_LIST_HEAD(&indr_dev->cb_list);
- indr_dev->dev = dev;
- indr_dev->block = tc_dev_ingress_block(dev);
- if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- tc_indr_setup_block_ht_params)) {
- kfree(indr_dev);
- return NULL;
+ while (tp) {
+ tp_next = rcu_dereference_protected(tp->next, 1);
+ tcf_proto_put(tp, rtnl_held, NULL);
+ tp = tp_next;
}
-
-inc_ref:
- indr_dev->refcnt++;
- return indr_dev;
}
-static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
-{
- if (--indr_dev->refcnt)
- return;
-
- rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- tc_indr_setup_block_ht_params);
- kfree(indr_dev);
-}
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo);
-static struct tc_indr_block_cb *
-tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
+static void tcf_block_offload_init(struct flow_block_offload *bo,
+ struct net_device *dev, struct Qdisc *sch,
+ enum flow_block_command command,
+ enum flow_block_binder_type binder_type,
+ struct flow_block *flow_block,
+ bool shared, struct netlink_ext_ack *extack)
{
- struct tc_indr_block_cb *indr_block_cb;
-
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- if (indr_block_cb->cb == cb &&
- indr_block_cb->cb_ident == cb_ident)
- return indr_block_cb;
- return NULL;
+ bo->net = dev_net(dev);
+ bo->command = command;
+ bo->binder_type = binder_type;
+ bo->block = flow_block;
+ bo->block_shared = shared;
+ bo->extack = extack;
+ bo->sch = sch;
+ bo->cb_list_head = &flow_block->cb_list;
+ INIT_LIST_HEAD(&bo->cb_list);
}
-static struct tc_indr_block_cb *
-tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct tc_indr_block_cb *indr_block_cb;
-
- indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (indr_block_cb)
- return ERR_PTR(-EEXIST);
+static void tcf_block_unbind(struct tcf_block *block,
+ struct flow_block_offload *bo);
- indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
- if (!indr_block_cb)
- return ERR_PTR(-ENOMEM);
-
- indr_block_cb->cb_priv = cb_priv;
- indr_block_cb->cb = cb;
- indr_block_cb->cb_ident = cb_ident;
- list_add(&indr_block_cb->list, &indr_dev->cb_list);
-
- return indr_block_cb;
-}
-
-static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
+static void tc_block_indr_cleanup(struct flow_block_cb *block_cb)
{
- list_del(&indr_block_cb->list);
- kfree(indr_block_cb);
-}
-
-static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
- struct tc_indr_block_cb *indr_block_cb,
- enum tc_block_command command)
-{
- struct tc_block_offload bo = {
- .command = command,
- .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
- .block = indr_dev->block,
- };
-
- if (!indr_dev->block)
- return;
-
- indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- &bo);
-}
-
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
- int err;
-
- indr_dev = tc_indr_block_dev_get(dev);
- if (!indr_dev)
- return -ENOMEM;
-
- indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
- err = PTR_ERR_OR_ZERO(indr_block_cb);
- if (err)
- goto err_dev_put;
-
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND);
- return 0;
-
-err_dev_put:
- tc_indr_block_dev_put(indr_dev);
- return err;
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
-
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- int err;
-
+ struct tcf_block *block = block_cb->indr.data;
+ struct net_device *dev = block_cb->indr.dev;
+ struct Qdisc *sch = block_cb->indr.sch;
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo = {};
+
+ tcf_block_offload_init(&bo, dev, sch, FLOW_BLOCK_UNBIND,
+ block_cb->indr.binder_type,
+ &block->flow_block, tcf_block_shared(block),
+ &extack);
rtnl_lock();
- err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
+ down_write(&block->cb_lock);
+ list_del(&block_cb->driver_list);
+ list_move(&block_cb->list, &bo.cb_list);
+ tcf_block_unbind(block, &bo);
+ up_write(&block->cb_lock);
rtnl_unlock();
-
- return err;
-}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
-
-void __tc_indr_block_cb_unregister(struct net_device *dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
-
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
-
- indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (!indr_block_cb)
- return;
-
- /* Send unbind message if required to free any block cbs. */
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND);
- tc_indr_block_cb_del(indr_block_cb);
- tc_indr_block_dev_put(indr_dev);
}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
-void tc_indr_block_cb_unregister(struct net_device *dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
+static bool tcf_block_offload_in_use(struct tcf_block *block)
{
- rtnl_lock();
- __tc_indr_block_cb_unregister(dev, cb, cb_ident);
- rtnl_unlock();
+ return atomic_read(&block->offloadcnt);
}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
-static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
- struct tcf_block_ext_info *ei,
- enum tc_block_command command,
- struct netlink_ext_ack *extack)
+static int tcf_block_offload_cmd(struct tcf_block *block,
+ struct net_device *dev, struct Qdisc *sch,
+ struct tcf_block_ext_info *ei,
+ enum flow_block_command command,
+ struct netlink_ext_ack *extack)
{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
- struct tc_block_offload bo = {
- .command = command,
- .binder_type = ei->binder_type,
- .block = block,
- .extack = extack,
- };
+ struct flow_block_offload bo = {};
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
+ tcf_block_offload_init(&bo, dev, sch, command, ei->binder_type,
+ &block->flow_block, tcf_block_shared(block),
+ extack);
- indr_dev->block = command == TC_BLOCK_BIND ? block : NULL;
+ if (dev->netdev_ops->ndo_setup_tc) {
+ int err;
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- &bo);
-}
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ if (err < 0) {
+ if (err != -EOPNOTSUPP)
+ NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed");
+ return err;
+ }
-static bool tcf_block_offload_in_use(struct tcf_block *block)
-{
- return block->offloadcnt;
-}
+ return tcf_block_setup(block, &bo);
+ }
-static int tcf_block_offload_cmd(struct tcf_block *block,
- struct net_device *dev,
- struct tcf_block_ext_info *ei,
- enum tc_block_command command,
- struct netlink_ext_ack *extack)
-{
- struct tc_block_offload bo = {};
+ flow_indr_dev_setup_offload(dev, sch, TC_SETUP_BLOCK, block, &bo,
+ tc_block_indr_cleanup);
+ tcf_block_setup(block, &bo);
- bo.command = command;
- bo.binder_type = ei->binder_type;
- bo.block = block;
- bo.extack = extack;
- return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ return -EOPNOTSUPP;
}
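For context only (not part of the patch): on the driver side, the TC_SETUP_BLOCK command built by tcf_block_offload_cmd() above is normally answered from ndo_setup_tc with the generic flow_block helpers. A minimal sketch follows; the driver name, callback, private data and list are hypothetical, and flow_block_cb_setup_simple()/flow_setup_cb_t are assumed to have their usual signatures from the flow_offload infrastructure.

/* Hypothetical driver-side counterpart of the bind/unbind request above. */
static LIST_HEAD(example_block_cb_list);

static int example_setup_tc_block_cb(enum tc_setup_type type,
				      void *type_data, void *cb_priv)
{
	switch (type) {
	case TC_SETUP_CLSFLOWER:
		/* program the hardware from the flow_cls_offload in type_data */
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

static int example_ndo_setup_tc(struct net_device *dev,
				enum tc_setup_type type, void *type_data)
{
	void *priv = netdev_priv(dev);	/* driver private data, hypothetical */

	switch (type) {
	case TC_SETUP_BLOCK:
		/* registers/unregisters the callback according to bo->command */
		return flow_block_cb_setup_simple(type_data,
						  &example_block_cb_list,
						  example_setup_tc_block_cb,
						  priv, priv, true);
	default:
		return -EOPNOTSUPP;
	}
}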
static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
@@ -632,32 +859,37 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
- if (!dev->netdev_ops->ndo_setup_tc)
- goto no_offload_dev_inc;
+ down_write(&block->cb_lock);
/* If tc offload feature is disabled and the block we try to bind
* to already has some offloaded filters, forbid to bind.
*/
- if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
+ if (dev->netdev_ops->ndo_setup_tc &&
+ !tc_can_offload(dev) &&
+ tcf_block_offload_in_use(block)) {
NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto err_unlock;
}
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
+ err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
if (err)
- return err;
+ goto err_unlock;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+ up_write(&block->cb_lock);
return 0;
no_offload_dev_inc:
if (tcf_block_offload_in_use(block))
- return -EOPNOTSUPP;
+ goto err_unlock;
+
+ err = 0;
block->nooffloaddevcnt++;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
- return 0;
+err_unlock:
+ up_write(&block->cb_lock);
+ return err;
}
static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
@@ -666,17 +898,16 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL);
-
- if (!dev->netdev_ops->ndo_setup_tc)
- goto no_offload_dev_dec;
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+ down_write(&block->cb_lock);
+ err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
+ up_write(&block->cb_lock);
return;
no_offload_dev_dec:
WARN_ON(block->nooffloaddevcnt-- == 0);
+ up_write(&block->cb_lock);
}
static int
@@ -684,8 +915,8 @@ tcf_chain0_head_change_cb_add(struct tcf_block *block,
struct tcf_block_ext_info *ei,
struct netlink_ext_ack *extack)
{
- struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
+ struct tcf_chain *chain0;
item = kmalloc(sizeof(*item), GFP_KERNEL);
if (!item) {
@@ -694,9 +925,32 @@ tcf_chain0_head_change_cb_add(struct tcf_block *block,
}
item->chain_head_change = ei->chain_head_change;
item->chain_head_change_priv = ei->chain_head_change_priv;
- if (chain0 && chain0->filter_chain)
- tcf_chain_head_change_item(item, chain0->filter_chain);
- list_add(&item->list, &block->chain0.filter_chain_list);
+
+ mutex_lock(&block->lock);
+ chain0 = block->chain0.chain;
+ if (chain0)
+ tcf_chain_hold(chain0);
+ else
+ list_add(&item->list, &block->chain0.filter_chain_list);
+ mutex_unlock(&block->lock);
+
+ if (chain0) {
+ struct tcf_proto *tp_head;
+
+ mutex_lock(&chain0->filter_chain_lock);
+
+ tp_head = tcf_chain_dereference(chain0->filter_chain, chain0);
+ if (tp_head)
+ tcf_chain_head_change_item(item, tp_head);
+
+ mutex_lock(&block->lock);
+ list_add(&item->list, &block->chain0.filter_chain_list);
+ mutex_unlock(&block->lock);
+
+ mutex_unlock(&chain0->filter_chain_lock);
+ tcf_chain_put(chain0);
+ }
+
return 0;
}
@@ -704,20 +958,23 @@ static void
tcf_chain0_head_change_cb_del(struct tcf_block *block,
struct tcf_block_ext_info *ei)
{
- struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
+ mutex_lock(&block->lock);
list_for_each_entry(item, &block->chain0.filter_chain_list, list) {
if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
(item->chain_head_change == ei->chain_head_change &&
item->chain_head_change_priv == ei->chain_head_change_priv)) {
- if (chain0)
+ if (block->chain0.chain)
tcf_chain_head_change_item(item, NULL);
list_del(&item->list);
+ mutex_unlock(&block->lock);
+
kfree(item);
return;
}
}
+ mutex_unlock(&block->lock);
WARN_ON(1);
}
@@ -764,14 +1021,18 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
NL_SET_ERR_MSG(extack, "Memory allocation for block failed");
return ERR_PTR(-ENOMEM);
}
+ mutex_init(&block->lock);
+ mutex_init(&block->proto_destroy_lock);
+ init_rwsem(&block->cb_lock);
+ flow_block_init(&block->flow_block);
INIT_LIST_HEAD(&block->chain_list);
- INIT_LIST_HEAD(&block->cb_list);
INIT_LIST_HEAD(&block->owner_list);
INIT_LIST_HEAD(&block->chain0.filter_chain_list);
refcount_set(&block->refcnt, 1);
block->net = net;
block->index = block_index;
+ xa_init(&block->ports);
/* Don't store q pointer for blocks which are shared */
if (!tcf_block_shared(block))
@@ -779,12 +1040,13 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
return block;
}
-static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
+struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
return idr_find(&tn->idr, block_index);
}
+EXPORT_SYMBOL(tcf_block_lookup);
static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index)
{
@@ -799,157 +1061,240 @@ static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index)
return block;
}
-static void tcf_block_flush_all_chains(struct tcf_block *block)
+static struct tcf_chain *
+__tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain)
{
- struct tcf_chain *chain;
+ mutex_lock(&block->lock);
+ if (chain)
+ chain = list_is_last(&chain->list, &block->chain_list) ?
+ NULL : list_next_entry(chain, list);
+ else
+ chain = list_first_entry_or_null(&block->chain_list,
+ struct tcf_chain, list);
- /* Hold a refcnt for all chains, so that they don't disappear
- * while we are iterating.
- */
- list_for_each_entry(chain, &block->chain_list, list)
+ /* skip all action-only chains */
+ while (chain && tcf_chain_held_by_acts_only(chain))
+ chain = list_is_last(&chain->list, &block->chain_list) ?
+ NULL : list_next_entry(chain, list);
+
+ if (chain)
tcf_chain_hold(chain);
+ mutex_unlock(&block->lock);
- list_for_each_entry(chain, &block->chain_list, list)
- tcf_chain_flush(chain);
+ return chain;
}
-static void tcf_block_put_all_chains(struct tcf_block *block)
+/* Function to be used by all clients that want to iterate over all chains on
+ * block. It properly obtains block->lock and takes reference to chain before
+ * returning it. Users of this function must be tolerant to concurrent chain
+ * insertion/deletion or ensure that no concurrent chain modification is
+ * possible. Note that all netlink dump callbacks cannot guarantee to provide
+ * consistent dump because rtnl lock is released each time skb is filled with
+ * data and sent to user-space.
+ */
+
+struct tcf_chain *
+tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain)
{
- struct tcf_chain *chain, *tmp;
+ struct tcf_chain *chain_next = __tcf_get_next_chain(block, chain);
- /* At this point, all the chains should have refcnt >= 1. */
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
- tcf_chain_put_explicitly_created(chain);
+ if (chain)
tcf_chain_put(chain);
- }
+
+ return chain_next;
}
+EXPORT_SYMBOL(tcf_get_next_chain);
-static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q,
- struct tcf_block_ext_info *ei)
+static struct tcf_proto *
+__tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp)
{
- if (refcount_dec_and_test(&block->refcnt)) {
- /* Flushing/putting all chains will cause the block to be
- * deallocated when last chain is freed. However, if chain_list
- * is empty, block has to be manually deallocated. After block
- * reference counter reached 0, it is no longer possible to
- * increment it or add new chains to block.
- */
- bool free_block = list_empty(&block->chain_list);
+ u32 prio = 0;
- if (tcf_block_shared(block))
- tcf_block_remove(block, block->net);
- if (!free_block)
- tcf_block_flush_all_chains(block);
+ ASSERT_RTNL();
+ mutex_lock(&chain->filter_chain_lock);
- if (q)
- tcf_block_offload_unbind(block, q, ei);
+ if (!tp) {
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
+ } else if (tcf_proto_is_deleting(tp)) {
+ /* 'deleting' flag is set and chain->filter_chain_lock was
+ * unlocked, which means next pointer could be invalid. Restart
+ * search.
+ */
+ prio = tp->prio + 1;
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
- if (free_block)
- kfree_rcu(block, rcu);
- else
- tcf_block_put_all_chains(block);
- } else if (q) {
- tcf_block_offload_unbind(block, q, ei);
+ for (; tp; tp = tcf_chain_dereference(tp->next, chain))
+ if (!tp->deleting && tp->prio >= prio)
+ break;
+ } else {
+ tp = tcf_chain_dereference(tp->next, chain);
}
+
+ if (tp)
+ tcf_proto_get(tp);
+
+ mutex_unlock(&chain->filter_chain_lock);
+
+ return tp;
}
-static void tcf_block_refcnt_put(struct tcf_block *block)
+/* Function to be used by all clients that want to iterate over all tp's on
+ * chain. Users of this function must be tolerant to concurrent tp
+ * insertion/deletion or ensure that no concurrent chain modification is
+ * possible. Note that all netlink dump callbacks cannot guarantee to provide
+ * consistent dump because rtnl lock is released each time skb is filled with
+ * data and sent to user-space.
+ */
+
+struct tcf_proto *
+tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp)
{
- __tcf_block_put(block, NULL, NULL);
+ struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp);
+
+ if (tp)
+ tcf_proto_put(tp, true, NULL);
+
+ return tp_next;
}
+EXPORT_SYMBOL(tcf_get_next_proto);
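Illustrative usage of the two iterators exported above (not part of the patch): both helpers drop the reference to the previous element themselves, so a caller such as a dump routine can walk every classifier on a block with plain for-loops. The function name example_walk_block is hypothetical; rtnl is assumed to be held by the caller, as __tcf_get_next_proto() asserts.

static void example_walk_block(struct tcf_block *block)
{
	struct tcf_chain *chain;
	struct tcf_proto *tp;

	for (chain = tcf_get_next_chain(block, NULL); chain;
	     chain = tcf_get_next_chain(block, chain)) {
		for (tp = tcf_get_next_proto(chain, NULL); tp;
		     tp = tcf_get_next_proto(chain, tp))
			/* references to chain and tp are held here */
			pr_debug("chain %u: %s filter, protocol 0x%x\n",
				 chain->index, tp->ops->kind,
				 ntohs(tp->protocol));
	}
}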
-/* Find tcf block.
- * Set q, parent, cl when appropriate.
+static void tcf_block_flush_all_chains(struct tcf_block *block, bool rtnl_held)
+{
+ struct tcf_chain *chain;
+
+ /* Last reference to block. At this point chains cannot be added or
+ * removed concurrently.
+ */
+ for (chain = tcf_get_next_chain(block, NULL);
+ chain;
+ chain = tcf_get_next_chain(block, chain)) {
+ tcf_chain_put_explicitly_created(chain);
+ tcf_chain_flush(chain, rtnl_held);
+ }
+}
+
+/* Lookup Qdisc and increments its reference counter.
+ * Set parent, if necessary.
*/
-static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
- u32 *parent, unsigned long *cl,
- int ifindex, u32 block_index,
- struct netlink_ext_ack *extack)
+static int __tcf_qdisc_find(struct net *net, struct Qdisc **q,
+ u32 *parent, int ifindex, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
- struct tcf_block *block;
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
int err = 0;
- if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_refcnt_get(net, block_index);
- if (!block) {
- NL_SET_ERR_MSG(extack, "Block of given index was not found");
- return ERR_PTR(-EINVAL);
- }
- } else {
- const struct Qdisc_class_ops *cops;
- struct net_device *dev;
-
- rcu_read_lock();
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ return 0;
- /* Find link */
- dev = dev_get_by_index_rcu(net, ifindex);
- if (!dev) {
- rcu_read_unlock();
- return ERR_PTR(-ENODEV);
- }
+ rcu_read_lock();
- /* Find qdisc */
- if (!*parent) {
- *q = dev->qdisc;
- *parent = (*q)->handle;
- } else {
- *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent));
- if (!*q) {
- NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
- err = -EINVAL;
- goto errout_rcu;
- }
- }
+ /* Find link */
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
- *q = qdisc_refcount_inc_nz(*q);
+ /* Find qdisc */
+ if (!*parent) {
+ *q = rcu_dereference(dev->qdisc);
+ *parent = (*q)->handle;
+ } else {
+ *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent));
if (!*q) {
NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
err = -EINVAL;
goto errout_rcu;
}
+ }
- /* Is it classful? */
- cops = (*q)->ops->cl_ops;
- if (!cops) {
- NL_SET_ERR_MSG(extack, "Qdisc not classful");
- err = -EINVAL;
- goto errout_rcu;
- }
+ *q = qdisc_refcount_inc_nz(*q);
+ if (!*q) {
+ NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+ err = -EINVAL;
+ goto errout_rcu;
+ }
- if (!cops->tcf_block) {
- NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
- err = -EOPNOTSUPP;
- goto errout_rcu;
- }
+ /* Is it classful? */
+ cops = (*q)->ops->cl_ops;
+ if (!cops) {
+ NL_SET_ERR_MSG(extack, "Qdisc not classful");
+ err = -EINVAL;
+ goto errout_qdisc;
+ }
- /* At this point we know that qdisc is not noop_qdisc,
- * which means that qdisc holds a reference to net_device
- * and we hold a reference to qdisc, so it is safe to release
- * rcu read lock.
- */
- rcu_read_unlock();
+ if (!cops->tcf_block) {
+ NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
+ err = -EOPNOTSUPP;
+ goto errout_qdisc;
+ }
- /* Do we search for filter, attached to class? */
- if (TC_H_MIN(*parent)) {
- *cl = cops->find(*q, *parent);
- if (*cl == 0) {
- NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
- err = -ENOENT;
- goto errout_qdisc;
- }
+errout_rcu:
+ /* At this point we know that qdisc is not noop_qdisc,
+ * which means that qdisc holds a reference to net_device
+ * and we hold a reference to qdisc, so it is safe to release
+ * rcu read lock.
+ */
+ rcu_read_unlock();
+ return err;
+
+errout_qdisc:
+ rcu_read_unlock();
+
+ if (rtnl_held)
+ qdisc_put(*q);
+ else
+ qdisc_put_unlocked(*q);
+ *q = NULL;
+
+ return err;
+}
+
+static int __tcf_qdisc_cl_find(struct Qdisc *q, u32 parent, unsigned long *cl,
+ int ifindex, struct netlink_ext_ack *extack)
+{
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ return 0;
+
+ /* Do we search for filter, attached to class? */
+ if (TC_H_MIN(parent)) {
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+
+ *cl = cops->find(q, parent);
+ if (*cl == 0) {
+ NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
+ return -ENOENT;
}
+ }
+
+ return 0;
+}
+
+static struct tcf_block *__tcf_block_find(struct net *net, struct Qdisc *q,
+ unsigned long cl, int ifindex,
+ u32 block_index,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block;
- /* And the last stroke */
- block = cops->tcf_block(*q, *cl, extack);
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_refcnt_get(net, block_index);
if (!block) {
- err = -EINVAL;
- goto errout_qdisc;
+ NL_SET_ERR_MSG(extack, "Block of given index was not found");
+ return ERR_PTR(-EINVAL);
}
+ } else {
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+
+ block = cops->tcf_block(q, cl, extack);
+ if (!block)
+ return ERR_PTR(-EINVAL);
+
if (tcf_block_shared(block)) {
NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
- err = -EOPNOTSUPP;
- goto errout_qdisc;
+ return ERR_PTR(-EOPNOTSUPP);
}
/* Always take reference to block in order to support execution
@@ -962,40 +1307,107 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
}
return block;
+}
+
+static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei, bool rtnl_held)
+{
+ if (refcount_dec_and_mutex_lock(&block->refcnt, &block->lock)) {
+ /* Flushing/putting all chains will cause the block to be
+ * deallocated when last chain is freed. However, if chain_list
+ * is empty, block has to be manually deallocated. After block
+ * reference counter reached 0, it is no longer possible to
+ * increment it or add new chains to block.
+ */
+ bool free_block = list_empty(&block->chain_list);
+
+ mutex_unlock(&block->lock);
+ if (tcf_block_shared(block))
+ tcf_block_remove(block, block->net);
+
+ if (q)
+ tcf_block_offload_unbind(block, q, ei);
+
+ if (free_block)
+ tcf_block_destroy(block);
+ else
+ tcf_block_flush_all_chains(block, rtnl_held);
+ } else if (q) {
+ tcf_block_offload_unbind(block, q, ei);
+ }
+}
+
+static void tcf_block_refcnt_put(struct tcf_block *block, bool rtnl_held)
+{
+ __tcf_block_put(block, NULL, NULL, rtnl_held);
+}
+
+/* Find tcf block.
+ * Set q, parent, cl when appropriate.
+ */
+
+static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
+ u32 *parent, unsigned long *cl,
+ int ifindex, u32 block_index,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ err = __tcf_qdisc_find(net, q, parent, ifindex, true, extack);
+ if (err)
+ goto errout;
+
+ err = __tcf_qdisc_cl_find(*q, *parent, cl, ifindex, extack);
+ if (err)
+ goto errout_qdisc;
+
+ block = __tcf_block_find(net, *q, *cl, ifindex, block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout_qdisc;
+ }
+
+ return block;
-errout_rcu:
- rcu_read_unlock();
errout_qdisc:
- if (*q) {
+ if (*q)
qdisc_put(*q);
- *q = NULL;
- }
+errout:
+ *q = NULL;
return ERR_PTR(err);
}
-static void tcf_block_release(struct Qdisc *q, struct tcf_block *block)
+static void tcf_block_release(struct Qdisc *q, struct tcf_block *block,
+ bool rtnl_held)
{
if (!IS_ERR_OR_NULL(block))
- tcf_block_refcnt_put(block);
+ tcf_block_refcnt_put(block, rtnl_held);
- if (q)
- qdisc_put(q);
+ if (q) {
+ if (rtnl_held)
+ qdisc_put(q);
+ else
+ qdisc_put_unlocked(q);
+ }
}
struct tcf_block_owner_item {
struct list_head list;
struct Qdisc *q;
- enum tcf_block_binder_type binder_type;
+ enum flow_block_binder_type binder_type;
};
static void
tcf_block_owner_netif_keep_dst(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
if (block->keep_dst &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
netif_keep_dst(qdisc_dev(q));
}
@@ -1012,7 +1424,7 @@ EXPORT_SYMBOL(tcf_block_netif_keep_dst);
static int tcf_block_owner_add(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -1027,7 +1439,7 @@ static int tcf_block_owner_add(struct tcf_block *block,
static void tcf_block_owner_del(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -1041,10 +1453,19 @@ static void tcf_block_owner_del(struct tcf_block *block,
WARN_ON(1);
}
+static bool tcf_block_tracks_dev(struct tcf_block *block,
+ struct tcf_block_ext_info *ei)
+{
+ return tcf_block_shared(block) &&
+ (ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS ||
+ ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS);
+}
+
int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
struct tcf_block_ext_info *ei,
struct netlink_ext_ack *extack)
{
+ struct net_device *dev = qdisc_dev(q);
struct net *net = qdisc_net(q);
struct tcf_block *block = NULL;
int err;
@@ -1078,16 +1499,26 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
if (err)
goto err_block_offload_bind;
+ if (tcf_block_tracks_dev(block, ei)) {
+ err = xa_insert(&block->ports, dev->ifindex, dev, GFP_KERNEL);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "block dev insert failed");
+ goto err_dev_insert;
+ }
+ }
+
*p_block = block;
return 0;
+err_dev_insert:
+ tcf_block_offload_unbind(block, q, ei);
err_block_offload_bind:
tcf_chain0_head_change_cb_del(block, ei);
err_chain0_head_change_cb_add:
tcf_block_owner_del(block, q, ei->binder_type);
err_block_owner_add:
err_block_insert:
- tcf_block_refcnt_put(block);
+ tcf_block_refcnt_put(block, true);
return err;
}
EXPORT_SYMBOL(tcf_block_get_ext);
@@ -1119,12 +1550,16 @@ EXPORT_SYMBOL(tcf_block_get);
void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
struct tcf_block_ext_info *ei)
{
+ struct net_device *dev = qdisc_dev(q);
+
if (!block)
return;
+ if (tcf_block_tracks_dev(block, ei))
+ xa_erase(&block->ports, dev->ifindex);
tcf_chain0_head_change_cb_del(block, ei);
tcf_block_owner_del(block, q, ei->binder_type);
- __tcf_block_put(block, q, ei);
+ __tcf_block_put(block, q, ei, true);
}
EXPORT_SYMBOL(tcf_block_put_ext);
@@ -1139,55 +1574,29 @@ void tcf_block_put(struct tcf_block *block)
EXPORT_SYMBOL(tcf_block_put);
-struct tcf_block_cb {
- struct list_head list;
- tc_setup_cb_t *cb;
- void *cb_ident;
- void *cb_priv;
- unsigned int refcnt;
-};
-
-void *tcf_block_cb_priv(struct tcf_block_cb *block_cb)
-{
- return block_cb->cb_priv;
-}
-EXPORT_SYMBOL(tcf_block_cb_priv);
-
-struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
-{
- struct tcf_block_cb *block_cb;
-
- list_for_each_entry(block_cb, &block->cb_list, list)
- if (block_cb->cb == cb && block_cb->cb_ident == cb_ident)
- return block_cb;
- return NULL;
-}
-EXPORT_SYMBOL(tcf_block_cb_lookup);
-
-void tcf_block_cb_incref(struct tcf_block_cb *block_cb)
-{
- block_cb->refcnt++;
-}
-EXPORT_SYMBOL(tcf_block_cb_incref);
-
-unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
-{
- return --block_cb->refcnt;
-}
-EXPORT_SYMBOL(tcf_block_cb_decref);
-
static int
-tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
+tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
void *cb_priv, bool add, bool offload_in_use,
struct netlink_ext_ack *extack)
{
- struct tcf_chain *chain;
- struct tcf_proto *tp;
+ struct tcf_chain *chain, *chain_prev;
+ struct tcf_proto *tp, *tp_prev;
int err;
- list_for_each_entry(chain, &block->chain_list, list) {
- for (tp = rtnl_dereference(chain->filter_chain); tp;
- tp = rtnl_dereference(tp->next)) {
+ lockdep_assert_held(&block->cb_lock);
+
+ for (chain = __tcf_get_next_chain(block, NULL);
+ chain;
+ chain_prev = chain,
+ chain = __tcf_get_next_chain(block, chain),
+ tcf_chain_put(chain_prev)) {
+ if (chain->tmplt_ops && add)
+ chain->tmplt_ops->tmplt_reoffload(chain, true, cb,
+ cb_priv);
+ for (tp = __tcf_get_next_proto(chain, NULL); tp;
+ tp_prev = tp,
+ tp = __tcf_get_next_proto(chain, tp),
+ tcf_proto_put(tp_prev, true, NULL)) {
if (tp->ops->reoffload) {
err = tp->ops->reoffload(tp, add, cb, cb_priv,
extack);
@@ -1199,107 +1608,169 @@ tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
goto err_playback_remove;
}
}
+ if (chain->tmplt_ops && !add)
+ chain->tmplt_ops->tmplt_reoffload(chain, false, cb,
+ cb_priv);
}
return 0;
err_playback_remove:
+ tcf_proto_put(tp, true, NULL);
+ tcf_chain_put(chain);
tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use,
extack);
return err;
}
-struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv,
- struct netlink_ext_ack *extack)
+static int tcf_block_bind(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- struct tcf_block_cb *block_cb;
- int err;
+ struct flow_block_cb *block_cb, *next;
+ int err, i = 0;
- /* Replay any already present rules */
- err = tcf_block_playback_offloads(block, cb, cb_priv, true,
- tcf_block_offload_in_use(block),
- extack);
- if (err)
- return ERR_PTR(err);
+ lockdep_assert_held(&block->cb_lock);
- block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
- if (!block_cb)
- return ERR_PTR(-ENOMEM);
- block_cb->cb = cb;
- block_cb->cb_ident = cb_ident;
- block_cb->cb_priv = cb_priv;
- list_add(&block_cb->list, &block->cb_list);
- return block_cb;
-}
-EXPORT_SYMBOL(__tcf_block_cb_register);
+ list_for_each_entry(block_cb, &bo->cb_list, list) {
+ err = tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, true,
+ tcf_block_offload_in_use(block),
+ bo->extack);
+ if (err)
+ goto err_unroll;
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt++;
-int tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv, struct netlink_ext_ack *extack)
-{
- struct tcf_block_cb *block_cb;
+ i++;
+ }
+ list_splice(&bo->cb_list, &block->flow_block.cb_list);
+
+ return 0;
- block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
- extack);
- return PTR_ERR_OR_ZERO(block_cb);
+err_unroll:
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ list_del(&block_cb->driver_list);
+ if (i-- > 0) {
+ list_del(&block_cb->list);
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
+ }
+ flow_block_cb_free(block_cb);
+ }
+
+ return err;
}
-EXPORT_SYMBOL(tcf_block_cb_register);
-void __tcf_block_cb_unregister(struct tcf_block *block,
- struct tcf_block_cb *block_cb)
+static void tcf_block_unbind(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
- false, tcf_block_offload_in_use(block),
- NULL);
- list_del(&block_cb->list);
- kfree(block_cb);
+ struct flow_block_cb *block_cb, *next;
+
+ lockdep_assert_held(&block->cb_lock);
+
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
+ }
}
-EXPORT_SYMBOL(__tcf_block_cb_unregister);
-void tcf_block_cb_unregister(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- struct tcf_block_cb *block_cb;
+ int err;
- block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
- if (!block_cb)
- return;
- __tcf_block_cb_unregister(block, block_cb);
+ switch (bo->command) {
+ case FLOW_BLOCK_BIND:
+ err = tcf_block_bind(block, bo);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ err = 0;
+ tcf_block_unbind(block, bo);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
}
-EXPORT_SYMBOL(tcf_block_cb_unregister);
/* Main classifier routine: scans classifier chain attached
* to this qdisc, (optionally) tests for protocol and asks
* specific classifiers.
*/
-int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res, bool compat_mode)
+static inline int __tcf_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ const struct tcf_proto *orig_tp,
+ struct tcf_result *res,
+ bool compat_mode,
+ struct tcf_exts_miss_cookie_node *n,
+ int act_index,
+ u32 *last_executed_chain)
{
#ifdef CONFIG_NET_CLS_ACT
- const int max_reclassify_loop = 4;
- const struct tcf_proto *orig_tp = tp;
+ const int max_reclassify_loop = 16;
const struct tcf_proto *first_tp;
int limit = 0;
reclassify:
#endif
for (; tp; tp = rcu_dereference_bh(tp->next)) {
- __be16 protocol = tc_skb_protocol(skb);
- int err;
+ __be16 protocol = skb_protocol(skb, false);
+ int err = 0;
+
+ if (n) {
+ struct tcf_exts *exts;
+
+ if (n->tp_prio != tp->prio)
+ continue;
+
+ /* We re-lookup the tp and chain based on index instead
+ * of having hard refs and locks to them, so do a sanity
+ * check if any of tp,chain,exts was replaced by the
+ * time we got here with a cookie from hardware.
+ */
+ if (unlikely(n->tp != tp || n->tp->chain != n->chain ||
+ !tp->ops->get_exts)) {
+ tcf_set_drop_reason(skb,
+ SKB_DROP_REASON_TC_COOKIE_ERROR);
+ return TC_ACT_SHOT;
+ }
- if (tp->protocol != protocol &&
- tp->protocol != htons(ETH_P_ALL))
- continue;
+ exts = tp->ops->get_exts(tp, n->handle);
+ if (unlikely(!exts || n->exts != exts)) {
+ tcf_set_drop_reason(skb,
+ SKB_DROP_REASON_TC_COOKIE_ERROR);
+ return TC_ACT_SHOT;
+ }
- err = tp->classify(skb, tp, res);
+ n = NULL;
+ err = tcf_exts_exec_ex(skb, exts, act_index, res);
+ } else {
+ if (tp->protocol != protocol &&
+ tp->protocol != htons(ETH_P_ALL))
+ continue;
+
+ err = tc_classify(skb, tp, res);
+ }
#ifdef CONFIG_NET_CLS_ACT
if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) {
first_tp = orig_tp;
+ *last_executed_chain = first_tp->chain->index;
goto reset;
} else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
first_tp = res->goto_tp;
+ *last_executed_chain = err & TC_ACT_EXT_VAL_MASK;
goto reset;
}
#endif
@@ -1307,6 +1778,12 @@ reclassify:
return err;
}
+ if (unlikely(n)) {
+ tcf_set_drop_reason(skb,
+ SKB_DROP_REASON_TC_COOKIE_ERROR);
+ return TC_ACT_SHOT;
+ }
+
return TC_ACT_UNSPEC; /* signal: continue lookup */
#ifdef CONFIG_NET_CLS_ACT
reset:
@@ -1315,6 +1792,8 @@ reset:
tp->chain->block->index,
tp->prio & 0xffff,
ntohs(tp->protocol));
+ tcf_set_drop_reason(skb,
+ SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
return TC_ACT_SHOT;
}
@@ -1322,6 +1801,87 @@ reset:
goto reclassify;
#endif
}
+
+int tcf_classify(struct sk_buff *skb,
+ const struct tcf_block *block,
+ const struct tcf_proto *tp,
+ struct tcf_result *res, bool compat_mode)
+{
+#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ u32 last_executed_chain = 0;
+
+ return __tcf_classify(skb, tp, tp, res, compat_mode, NULL, 0,
+ &last_executed_chain);
+#else
+ u32 last_executed_chain = tp ? tp->chain->index : 0;
+ struct tcf_exts_miss_cookie_node *n = NULL;
+ const struct tcf_proto *orig_tp = tp;
+ struct tc_skb_ext *ext;
+ int act_index = 0;
+ int ret;
+
+ if (block) {
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+
+ if (ext && (ext->chain || ext->act_miss)) {
+ struct tcf_chain *fchain;
+ u32 chain;
+
+ if (ext->act_miss) {
+ n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie,
+ &act_index);
+ if (!n) {
+ tcf_set_drop_reason(skb,
+ SKB_DROP_REASON_TC_COOKIE_ERROR);
+ return TC_ACT_SHOT;
+ }
+
+ chain = n->chain_index;
+ } else {
+ chain = ext->chain;
+ }
+
+ fchain = tcf_chain_lookup_rcu(block, chain);
+ if (!fchain) {
+ tcf_set_drop_reason(skb,
+ SKB_DROP_REASON_TC_CHAIN_NOTFOUND);
+
+ return TC_ACT_SHOT;
+ }
+
+ /* Consume, so cloned/redirect skbs won't inherit ext */
+ skb_ext_del(skb, TC_SKB_EXT);
+
+ tp = rcu_dereference_bh(fchain->filter_chain);
+ last_executed_chain = fchain->index;
+ }
+ }
+
+ ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, n, act_index,
+ &last_executed_chain);
+
+ if (tc_skb_ext_tc_enabled()) {
+ /* If we missed on some chain */
+ if (ret == TC_ACT_UNSPEC && last_executed_chain) {
+ struct tc_skb_cb *cb = tc_skb_cb(skb);
+
+ ext = tc_skb_ext_alloc(skb);
+ if (!ext) {
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM);
+ return TC_ACT_SHOT;
+ }
+ ext->chain = last_executed_chain;
+ ext->mru = cb->mru;
+ ext->post_ct = qdisc_skb_cb(skb)->post_ct;
+ ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat;
+ ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat;
+ ext->zone = cb->zone;
+ }
+ }
+
+ return ret;
+#endif
+}
EXPORT_SYMBOL(tcf_classify);
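A rough caller-side sketch (not part of the patch) of the extended tcf_classify() signature: a qdisc's classify step passes its filter list and either NULL or its tcf_block; passing the block opts in to the TC_SKB_EXT chain-restart handling implemented above. The struct example_sched_data, its fields and the simplified verdict handling are assumptions for illustration.

static int example_classify(struct sk_buff *skb, struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);	/* hypothetical */
	struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
	struct tcf_result res;
	int result;

	if (!fl)
		return -1;			/* no filters attached */

	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result == TC_ACT_SHOT || result == TC_ACT_UNSPEC)
		return -1;			/* dropped or no match */

	return TC_H_MIN(res.classid);		/* class picked by the filter */
}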
struct tcf_chain_info {
@@ -1329,50 +1889,148 @@ struct tcf_chain_info {
struct tcf_proto __rcu *next;
};
-static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain_info *chain_info)
+static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info)
{
- return rtnl_dereference(*chain_info->pprev);
+ return tcf_chain_dereference(*chain_info->pprev, chain);
}
-static void tcf_chain_tp_insert(struct tcf_chain *chain,
- struct tcf_chain_info *chain_info,
- struct tcf_proto *tp)
+static int tcf_chain_tp_insert(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info,
+ struct tcf_proto *tp)
{
+ if (chain->flushing)
+ return -EAGAIN;
+
+ RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info));
if (*chain_info->pprev == chain->filter_chain)
tcf_chain0_head_change(chain, tp);
- RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
+ tcf_proto_get(tp);
rcu_assign_pointer(*chain_info->pprev, tp);
- tcf_chain_hold(chain);
+
+ return 0;
}
static void tcf_chain_tp_remove(struct tcf_chain *chain,
struct tcf_chain_info *chain_info,
struct tcf_proto *tp)
{
- struct tcf_proto *next = rtnl_dereference(chain_info->next);
+ struct tcf_proto *next = tcf_chain_dereference(chain_info->next, chain);
+ tcf_proto_mark_delete(tp);
if (tp == chain->filter_chain)
tcf_chain0_head_change(chain, next);
RCU_INIT_POINTER(*chain_info->pprev, next);
- tcf_chain_put(chain);
}
static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
struct tcf_chain_info *chain_info,
u32 protocol, u32 prio,
- bool prio_allocate)
+ bool prio_allocate,
+ struct netlink_ext_ack *extack);
+
+/* Try to insert new proto.
+ * If proto with specified priority already exists, free new proto
+ * and return existing one.
+ */
+
+static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
+ struct tcf_proto *tp_new,
+ u32 protocol, u32 prio,
+ bool rtnl_held)
+{
+ struct tcf_chain_info chain_info;
+ struct tcf_proto *tp;
+ int err = 0;
+
+ mutex_lock(&chain->filter_chain_lock);
+
+ if (tcf_proto_exists_destroying(chain, tp_new)) {
+ mutex_unlock(&chain->filter_chain_lock);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, NULL);
+ if (!tp)
+ err = tcf_chain_tp_insert(chain, &chain_info, tp_new);
+ mutex_unlock(&chain->filter_chain_lock);
+
+ if (tp) {
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ tp_new = tp;
+ } else if (err) {
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ tp_new = ERR_PTR(err);
+ }
+
+ return tp_new;
+}
+
+static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
+ struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_chain_info chain_info;
+ struct tcf_proto *tp_iter;
+ struct tcf_proto **pprev;
+ struct tcf_proto *next;
+
+ mutex_lock(&chain->filter_chain_lock);
+
+ /* Atomically find and remove tp from chain. */
+ for (pprev = &chain->filter_chain;
+ (tp_iter = tcf_chain_dereference(*pprev, chain));
+ pprev = &tp_iter->next) {
+ if (tp_iter == tp) {
+ chain_info.pprev = pprev;
+ chain_info.next = tp_iter->next;
+ WARN_ON(tp_iter->deleting);
+ break;
+ }
+ }
+ /* Verify that tp still exists and no new filters were inserted
+ * concurrently.
+ * Mark tp for deletion if it is empty.
+ */
+ if (!tp_iter || !tcf_proto_check_delete(tp)) {
+ mutex_unlock(&chain->filter_chain_lock);
+ return;
+ }
+
+ tcf_proto_signal_destroying(chain, tp);
+ next = tcf_chain_dereference(chain_info.next, chain);
+ if (tp == chain->filter_chain)
+ tcf_chain0_head_change(chain, next);
+ RCU_INIT_POINTER(*chain_info.pprev, next);
+ mutex_unlock(&chain->filter_chain_lock);
+
+ tcf_proto_put(tp, rtnl_held, extack);
+}
+
+static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info,
+ u32 protocol, u32 prio,
+ bool prio_allocate,
+ struct netlink_ext_ack *extack)
{
struct tcf_proto **pprev;
struct tcf_proto *tp;
/* Check the chain for existence of proto-tcf with this priority */
for (pprev = &chain->filter_chain;
- (tp = rtnl_dereference(*pprev)); pprev = &tp->next) {
+ (tp = tcf_chain_dereference(*pprev, chain));
+ pprev = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
- if (prio_allocate ||
- (tp->protocol != protocol && protocol))
+ if (prio_allocate) {
+ NL_SET_ERR_MSG(extack, "Lowest ID from auto-alloc range already in use");
+ return ERR_PTR(-ENOSPC);
+ }
+ if (tp->protocol != protocol && protocol) {
+ NL_SET_ERR_MSG(extack, "Protocol mismatch for filter with specified priority");
return ERR_PTR(-EINVAL);
+ }
} else {
tp = NULL;
}
@@ -1380,18 +2038,26 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
}
}
chain_info->pprev = pprev;
- chain_info->next = tp ? tp->next : NULL;
+ if (tp) {
+ chain_info->next = tp->next;
+ tcf_proto_get(tp);
+ } else {
+ chain_info->next = NULL;
+ }
return tp;
}
static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcf_proto *tp, struct tcf_block *block,
struct Qdisc *q, u32 parent, void *fh,
- u32 portid, u32 seq, u16 flags, int event)
+ u32 portid, u32 seq, u16 flags, int event,
+ bool terse_dump, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
+ int ret = -EMSGSIZE;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
if (!nlh)
@@ -1414,92 +2080,152 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
if (!fh) {
tcm->tcm_handle = 0;
+ } else if (terse_dump) {
+ if (tp->ops->terse_dump) {
+ if (tp->ops->terse_dump(net, tp, fh, skb, tcm,
+ rtnl_held) < 0)
+ goto nla_put_failure;
+ } else {
+ goto cls_op_not_supp;
+ }
} else {
- if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
+ if (tp->ops->dump &&
+ tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0)
goto nla_put_failure;
}
+
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto nla_put_failure;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
+cls_op_not_supp:
+ ret = -EOPNOTSUPP;
out_nlmsg_trim:
nla_put_failure:
nlmsg_trim(skb, b);
- return -1;
+ return ret;
+}
+
+static struct sk_buff *tfilter_notify_prep(struct net *net,
+ struct sk_buff *oskb,
+ struct nlmsghdr *n,
+ struct tcf_proto *tp,
+ struct tcf_block *block,
+ struct Qdisc *q, u32 parent,
+ void *fh, int event,
+ u32 portid, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE;
+ struct sk_buff *skb;
+ int ret;
+
+retry:
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOBUFS);
+
+ ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+ n->nlmsg_seq, n->nlmsg_flags, event, false,
+ rtnl_held, extack);
+ if (ret <= 0) {
+ kfree_skb(skb);
+ if (ret == -EMSGSIZE) {
+ size += NLMSG_GOODSIZE;
+ goto retry;
+ }
+ return ERR_PTR(-EINVAL);
+ }
+ return skb;
}
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
struct tcf_block *block, struct Qdisc *q,
- u32 parent, void *fh, int event, bool unicast)
+ u32 parent, void *fh, int event, bool unicast,
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ int err = 0;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
+ if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return 0;
- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, event) <= 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
+ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event,
+ portid, rtnl_held, extack);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
if (unicast)
- return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
-
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
+ err = rtnl_unicast(skb, net, portid);
+ else
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ return err;
}
static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
struct tcf_block *block, struct Qdisc *q,
- u32 parent, void *fh, bool unicast, bool *last,
+ u32 parent, void *fh, bool *last, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
int err;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return tp->ops->delete(tp, fh, last, rtnl_held, extack);
- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh,
+ RTM_DELTFILTER, portid, rtnl_held, extack);
+ if (IS_ERR(skb)) {
NL_SET_ERR_MSG(extack, "Failed to build del event notification");
- kfree_skb(skb);
- return -EINVAL;
+ return PTR_ERR(skb);
}
- err = tp->ops->delete(tp, fh, last, extack);
+ err = tp->ops->delete(tp, fh, last, rtnl_held, extack);
if (err) {
kfree_skb(skb);
return err;
}
- if (unicast)
- return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
-
err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send filter delete notification");
+
return err;
}
static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
struct tcf_block *block, struct Qdisc *q,
u32 parent, struct nlmsghdr *n,
- struct tcf_chain *chain, int event)
+ struct tcf_chain *chain, int event,
+ struct netlink_ext_ack *extack)
{
struct tcf_proto *tp;
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next))
- tfilter_notify(net, oskb, n, tp, block,
- q, parent, NULL, event, false);
+ for (tp = tcf_get_next_proto(chain, NULL);
+ tp; tp = tcf_get_next_proto(chain, tp))
+ tfilter_notify(net, oskb, n, tp, block, q, parent, NULL,
+ event, false, true, extack);
+}
+
+static void tfilter_put(struct tcf_proto *tp, void *fh)
+{
+ if (tp->ops->put && fh)
+ tp->ops->put(tp, fh);
+}
+
+static bool is_qdisc_ingress(__u32 classid)
+{
+ return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS));
}
static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1507,29 +2233,30 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
bool prio_allocate;
u32 parent;
u32 chain_index;
- struct Qdisc *q = NULL;
+ struct Qdisc *q;
struct tcf_chain_info chain_info;
- struct tcf_chain *chain = NULL;
+ struct tcf_chain *chain;
struct tcf_block *block;
struct tcf_proto *tp;
unsigned long cl;
void *fh;
int err;
int tp_created;
-
- if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
+ bool rtnl_held = false;
+ u32 flags;
replay:
tp_created = 0;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1538,7 +2265,12 @@ replay:
prio = TC_H_MAJ(t->tcm_info);
prio_allocate = false;
parent = t->tcm_parent;
+ tp = NULL;
cl = 0;
+ block = NULL;
+ q = NULL;
+ chain = NULL;
+ flags = 0;
if (prio == 0) {
/* If no priority is provided by the user,
@@ -1555,14 +2287,40 @@ replay:
/* Find head of filter chain. */
- block = tcf_block_find(net, &q, &parent, &cl,
- t->tcm_ifindex, t->tcm_block_index, extack);
+ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack);
+ if (err)
+ return err;
+
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ /* Take rtnl mutex if rtnl_held was set to true on previous iteration,
+ * block is shared (no qdisc found), qdisc is not unlocked, classifier
+ * type is not specified, classifier is not unlocked.
+ */
+ if (rtnl_held ||
+ (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
+ !tcf_proto_is_unlocked(name)) {
+ rtnl_held = true;
+ rtnl_lock();
+ }
+
+ err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack);
+ if (err)
+ goto errout;
+
+ block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index,
+ extack);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto errout;
}
+ block->classid = parent;
- chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0);
if (chain_index > TC_ACT_EXT_VAL_MASK) {
NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
err = -EINVAL;
@@ -1575,40 +2333,60 @@ replay:
goto errout;
}
+ mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
- prio, prio_allocate);
+ prio, prio_allocate, extack);
if (IS_ERR(tp)) {
- NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
err = PTR_ERR(tp);
- goto errout;
+ goto errout_locked;
}
if (tp == NULL) {
+ struct tcf_proto *tp_new = NULL;
+
+ if (chain->flushing) {
+ err = -EAGAIN;
+ goto errout_locked;
+ }
+
/* Proto-tcf does not exist, create new one */
if (tca[TCA_KIND] == NULL || !protocol) {
NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified");
err = -EINVAL;
- goto errout;
+ goto errout_locked;
}
if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
err = -ENOENT;
- goto errout;
+ goto errout_locked;
}
if (prio_allocate)
- prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info));
+ prio = tcf_auto_prio(tcf_chain_tp_prev(chain,
+ &chain_info));
+
+ mutex_unlock(&chain->filter_chain_lock);
+ tp_new = tcf_proto_create(name, protocol, prio, chain,
+ rtnl_held, extack);
+ if (IS_ERR(tp_new)) {
+ err = PTR_ERR(tp_new);
+ goto errout_tp;
+ }
- tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
- protocol, prio, chain, extack);
+ tp_created = 1;
+ tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio,
+ rtnl_held);
if (IS_ERR(tp)) {
err = PTR_ERR(tp);
- goto errout;
+ goto errout_tp;
}
- tp_created = 1;
- } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ } else {
+ mutex_unlock(&chain->filter_chain_lock);
+ }
+
+ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
err = -EINVAL;
goto errout;
@@ -1623,38 +2401,65 @@ replay:
goto errout;
}
} else if (n->nlmsg_flags & NLM_F_EXCL) {
+ tfilter_put(tp, fh);
NL_SET_ERR_MSG(extack, "Filter already exists");
err = -EEXIST;
goto errout;
}
if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) {
+ tfilter_put(tp, fh);
NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind");
err = -EINVAL;
goto errout;
}
+ if (!(n->nlmsg_flags & NLM_F_CREATE))
+ flags |= TCA_ACT_FLAGS_REPLACE;
+ if (!rtnl_held)
+ flags |= TCA_ACT_FLAGS_NO_RTNL;
+ if (is_qdisc_ingress(parent))
+ flags |= TCA_ACT_FLAGS_AT_INGRESS;
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
- n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
- extack);
+ flags, extack);
if (err == 0) {
- if (tp_created)
- tcf_chain_tp_insert(chain, &chain_info, tp);
tfilter_notify(net, skb, n, tp, block, q, parent, fh,
- RTM_NEWTFILTER, false);
- } else {
- if (tp_created)
- tcf_proto_destroy(tp, NULL);
+ RTM_NEWTFILTER, false, rtnl_held, extack);
+ tfilter_put(tp, fh);
+ tcf_proto_count_usesw(tp, true);
+ /* q pointer is NULL for shared blocks */
+ if (q)
+ q->flags &= ~TCQ_F_CAN_BYPASS;
}
errout:
- if (chain)
- tcf_chain_put(chain);
- tcf_block_release(q, block);
- if (err == -EAGAIN)
+ if (err && tp_created)
+ tcf_chain_tp_delete_empty(chain, tp, rtnl_held, NULL);
+errout_tp:
+ if (chain) {
+ if (tp && !IS_ERR(tp))
+ tcf_proto_put(tp, rtnl_held, NULL);
+ if (!tp_created)
+ tcf_chain_put(chain);
+ }
+ tcf_block_release(q, block, rtnl_held);
+
+ if (rtnl_held)
+ rtnl_unlock();
+
+ if (err == -EAGAIN) {
+ /* Take rtnl lock in case EAGAIN is caused by concurrent flush
+ * of target chain.
+ */
+ rtnl_held = true;
/* Replay the request. */
goto replay;
+ }
return err;
+
+errout_locked:
+ mutex_unlock(&chain->filter_chain_lock);
+ goto errout;
}
static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1662,6 +2467,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -1670,16 +2476,15 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct Qdisc *q = NULL;
struct tcf_chain_info chain_info;
struct tcf_chain *chain = NULL;
- struct tcf_block *block;
+ struct tcf_block *block = NULL;
struct tcf_proto *tp = NULL;
unsigned long cl = 0;
void *fh = NULL;
int err;
+ bool rtnl_held = false;
- if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1695,14 +2500,38 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
/* Find head of filter chain. */
- block = tcf_block_find(net, &q, &parent, &cl,
- t->tcm_ifindex, t->tcm_block_index, extack);
+ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack);
+ if (err)
+ return err;
+
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+ /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc
+ * found), qdisc is not unlocked, classifier type is not specified,
+ * classifier is not unlocked.
+ */
+ if (!prio ||
+ (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
+ !tcf_proto_is_unlocked(name)) {
+ rtnl_held = true;
+ rtnl_lock();
+ }
+
+ err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack);
+ if (err)
+ goto errout;
+
+ block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index,
+ extack);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto errout;
}
- chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0);
if (chain_index > TC_ACT_EXT_VAL_MASK) {
NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
err = -EINVAL;
@@ -1724,56 +2553,72 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (prio == 0) {
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER);
- tcf_chain_flush(chain);
+ chain, RTM_DELTFILTER, extack);
+ tcf_chain_flush(chain, rtnl_held);
err = 0;
goto errout;
}
+ mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
- prio, false);
- if (!tp || IS_ERR(tp)) {
+ prio, false, extack);
+ if (!tp) {
+ err = -ENOENT;
NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
- err = tp ? PTR_ERR(tp) : -ENOENT;
- goto errout;
+ goto errout_locked;
+ } else if (IS_ERR(tp)) {
+ err = PTR_ERR(tp);
+ goto errout_locked;
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
err = -EINVAL;
+ goto errout_locked;
+ } else if (t->tcm_handle == 0) {
+ tcf_proto_signal_destroying(chain, tp);
+ tcf_chain_tp_remove(chain, &chain_info, tp);
+ mutex_unlock(&chain->filter_chain_lock);
+
+ tcf_proto_put(tp, rtnl_held, NULL);
+ tfilter_notify(net, skb, n, tp, block, q, parent, fh,
+ RTM_DELTFILTER, false, rtnl_held, extack);
+ err = 0;
goto errout;
}
+ mutex_unlock(&chain->filter_chain_lock);
fh = tp->ops->get(tp, t->tcm_handle);
if (!fh) {
- if (t->tcm_handle == 0) {
- tcf_chain_tp_remove(chain, &chain_info, tp);
- tfilter_notify(net, skb, n, tp, block, q, parent, fh,
- RTM_DELTFILTER, false);
- tcf_proto_destroy(tp, extack);
- err = 0;
- } else {
- NL_SET_ERR_MSG(extack, "Specified filter handle not found");
- err = -ENOENT;
- }
+ NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+ err = -ENOENT;
} else {
bool last;
- err = tfilter_del_notify(net, skb, n, tp, block,
- q, parent, fh, false, &last,
- extack);
+ err = tfilter_del_notify(net, skb, n, tp, block, q, parent, fh,
+ &last, rtnl_held, extack);
+
if (err)
goto errout;
- if (last) {
- tcf_chain_tp_remove(chain, &chain_info, tp);
- tcf_proto_destroy(tp, extack);
- }
+ if (last)
+ tcf_chain_tp_delete_empty(chain, tp, rtnl_held, extack);
}
errout:
- if (chain)
+ if (chain) {
+ if (tp && !IS_ERR(tp))
+ tcf_proto_put(tp, rtnl_held, NULL);
tcf_chain_put(chain);
- tcf_block_release(q, block);
+ }
+ tcf_block_release(q, block, rtnl_held);
+
+ if (rtnl_held)
+ rtnl_unlock();
+
return err;
+
+errout_locked:
+ mutex_unlock(&chain->filter_chain_lock);
+ goto errout;
}
static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1781,6 +2626,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -1789,13 +2635,15 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct Qdisc *q = NULL;
struct tcf_chain_info chain_info;
struct tcf_chain *chain = NULL;
- struct tcf_block *block;
+ struct tcf_block *block = NULL;
struct tcf_proto *tp = NULL;
unsigned long cl = 0;
void *fh = NULL;
int err;
+ bool rtnl_held = false;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1811,14 +2659,37 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
/* Find head of filter chain. */
- block = tcf_block_find(net, &q, &parent, &cl,
- t->tcm_ifindex, t->tcm_block_index, extack);
+ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack);
+ if (err)
+ return err;
+
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+ /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not
+ * unlocked, classifier type is not specified, classifier is not
+ * unlocked.
+ */
+ if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
+ !tcf_proto_is_unlocked(name)) {
+ rtnl_held = true;
+ rtnl_lock();
+ }
+
+ err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack);
+ if (err)
+ goto errout;
+
+ block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index,
+ extack);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto errout;
}
- chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0);
if (chain_index > TC_ACT_EXT_VAL_MASK) {
NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
err = -EINVAL;
@@ -1831,11 +2702,16 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
goto errout;
}
+ mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
- prio, false);
- if (!tp || IS_ERR(tp)) {
+ prio, false, extack);
+ mutex_unlock(&chain->filter_chain_lock);
+ if (!tp) {
+ err = -ENOENT;
NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
- err = tp ? PTR_ERR(tp) : -ENOENT;
+ goto errout;
+ } else if (IS_ERR(tp)) {
+ err = PTR_ERR(tp);
goto errout;
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
@@ -1850,15 +2726,23 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
err = -ENOENT;
} else {
err = tfilter_notify(net, skb, n, tp, block, q, parent,
- fh, RTM_NEWTFILTER, true);
+ fh, RTM_NEWTFILTER, true, rtnl_held, NULL);
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
}
+ tfilter_put(tp, fh);
errout:
- if (chain)
+ if (chain) {
+ if (tp && !IS_ERR(tp))
+ tcf_proto_put(tp, rtnl_held, NULL);
tcf_chain_put(chain);
- tcf_block_release(q, block);
+ }
+ tcf_block_release(q, block, rtnl_held);
+
+ if (rtnl_held)
+ rtnl_unlock();
+
return err;
}
@@ -1869,6 +2753,7 @@ struct tcf_dump_args {
struct tcf_block *block;
struct Qdisc *q;
u32 parent;
+ bool terse_dump;
};
static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
@@ -1879,21 +2764,25 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER);
+ RTM_NEWTFILTER, a->terse_dump, true, NULL);
}
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
struct sk_buff *skb, struct netlink_callback *cb,
- long index_start, long *p_index)
+ long index_start, long *p_index, bool terse)
{
struct net *net = sock_net(skb->sk);
struct tcf_block *block = chain->block;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct tcf_proto *tp, *tp_prev;
struct tcf_dump_args arg;
- struct tcf_proto *tp;
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next), (*p_index)++) {
+ for (tp = __tcf_get_next_proto(chain, NULL);
+ tp;
+ tp_prev = tp,
+ tp = __tcf_get_next_proto(chain, tp),
+ tcf_proto_put(tp_prev, true, NULL),
+ (*p_index)++) {
if (*p_index < index_start)
continue;
if (TC_H_MAJ(tcm->tcm_info) &&
@@ -1909,9 +2798,8 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER) <= 0)
- return false;
-
+ RTM_NEWTFILTER, false, true, NULL) <= 0)
+ goto errout;
cb->args[1] = 1;
}
if (!tp->ops->walk)
@@ -1926,24 +2814,35 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
arg.w.cookie = cb->args[2];
- tp->ops->walk(tp, &arg.w);
+ arg.terse_dump = terse;
+ tp->ops->walk(tp, &arg.w, true);
cb->args[2] = arg.w.cookie;
cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
- return false;
+ goto errout;
}
return true;
+
+errout:
+ tcf_proto_put(tp, true, NULL);
+ return false;
}
+static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = {
+ [TCA_CHAIN] = { .type = NLA_U32 },
+ [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE),
+};
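
For reference, a userspace dumper requests the terse mode by attaching a TCA_DUMP_FLAGS bitfield32 to the RTM_GETTFILTER dump request. A hedged sketch follows; the use of libmnl is an assumption, any netlink library that can append attributes works the same way.

#include <libmnl/libmnl.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

/* add the terse-dump flag to an RTM_GETTFILTER dump request */
static void request_terse_dump(struct nlmsghdr *nlh)
{
	struct nla_bitfield32 dump_flags = {
		.value    = TCA_DUMP_FLAGS_TERSE,
		.selector = TCA_DUMP_FLAGS_TERSE,
	};

	mnl_attr_put(nlh, TCA_DUMP_FLAGS, sizeof(dump_flags), &dump_flags);
}
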
+
/* called with RTNL */
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct tcf_chain *chain, *chain_prev;
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
struct Qdisc *q = NULL;
struct tcf_block *block;
- struct tcf_chain *chain;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ bool terse_dump = false;
long index_start;
long index;
u32 parent;
@@ -1952,11 +2851,18 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
- err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL,
- cb->extack);
+ err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX,
+ tcf_tfilter_dump_policy, cb->extack);
if (err)
return err;
+ if (tca[TCA_DUMP_FLAGS]) {
+ struct nla_bitfield32 flags =
+ nla_get_bitfield32(tca[TCA_DUMP_FLAGS]);
+
+ terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE;
+ }
+
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
@@ -1978,12 +2884,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
parent = tcm->tcm_parent;
- if (!parent) {
- q = dev->qdisc;
- parent = q->handle;
- } else {
+ if (!parent)
+ q = rtnl_dereference(dev->qdisc);
+ else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
- }
if (!q)
goto out;
cops = q->ops->cl_ops;
@@ -1999,6 +2903,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
block = cops->tcf_block(q, cl, NULL);
if (!block)
goto out;
+ parent = block->classid;
if (tcf_block_shared(block))
q = NULL;
}
@@ -2006,19 +2911,24 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
index_start = cb->args[0];
index = 0;
- list_for_each_entry(chain, &block->chain_list, list) {
+ for (chain = __tcf_get_next_chain(block, NULL);
+ chain;
+ chain_prev = chain,
+ chain = __tcf_get_next_chain(block, chain),
+ tcf_chain_put(chain_prev)) {
if (tca[TCA_CHAIN] &&
nla_get_u32(tca[TCA_CHAIN]) != chain->index)
continue;
if (!tcf_chain_dump(chain, q, parent, skb, cb,
- index_start, &index)) {
+ index_start, &index, terse_dump)) {
+ tcf_chain_put(chain);
err = -EMSGSIZE;
break;
}
}
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
- tcf_block_refcnt_put(block);
+ tcf_block_refcnt_put(block, true);
cb->args[0] = index;
out:
@@ -2028,9 +2938,12 @@ out:
return skb->len;
}
-static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
- struct sk_buff *skb, struct tcf_block *block,
- u32 portid, u32 seq, u16 flags, int event)
+static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv, u32 chain_index,
+ struct net *net, struct sk_buff *skb,
+ struct tcf_block *block,
+ u32 portid, u32 seq, u16 flags, int event,
+ struct netlink_ext_ack *extack)
{
unsigned char *b = skb_tail_pointer(skb);
const struct tcf_proto_ops *ops;
@@ -2038,8 +2951,8 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
struct tcmsg *tcm;
void *priv;
- ops = chain->tmplt_ops;
- priv = chain->tmplt_priv;
+ ops = tmplt_ops;
+ priv = tmplt_priv;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
if (!nlh)
@@ -2057,7 +2970,7 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
tcm->tcm_block_index = block->index;
}
- if (nla_put_u32(skb, TCA_CHAIN, chain->index))
+ if (nla_put_u32(skb, TCA_CHAIN, chain_index))
goto nla_put_failure;
if (ops) {
@@ -2067,7 +2980,12 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
goto nla_put_failure;
}
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -2077,25 +2995,59 @@ nla_put_failure:
}
static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
- u32 seq, u16 flags, int event, bool unicast)
+ u32 seq, u16 flags, int event, bool unicast,
+ struct netlink_ext_ack *extack)
{
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
struct tcf_block *block = chain->block;
struct net *net = block->net;
struct sk_buff *skb;
+ int err = 0;
+
+ if (!unicast && !rtnl_notify_needed(net, flags, RTNLGRP_TC))
+ return 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- if (tc_chain_fill_node(chain, net, skb, block, portid,
- seq, flags, event) <= 0) {
+ if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv,
+ chain->index, net, skb, block, portid,
+ seq, flags, event, extack) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
if (unicast)
- return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+ err = rtnl_unicast(skb, net, portid);
+ else
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ flags & NLM_F_ECHO);
+
+ return err;
+}
+
+static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv, u32 chain_index,
+ struct tcf_block *block, struct sk_buff *oskb,
+ u32 seq, u16 flags)
+{
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ struct net *net = block->net;
+ struct sk_buff *skb;
+
+ if (!rtnl_notify_needed(net, flags, RTNLGRP_TC))
+ return 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb,
+ block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO);
}
@@ -2105,17 +3057,25 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
struct netlink_ext_ack *extack)
{
const struct tcf_proto_ops *ops;
+ char name[IFNAMSIZ];
void *tmplt_priv;
/* If kind is not set, user did not specify template. */
if (!tca[TCA_KIND])
return 0;
- ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack);
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC chain template name too long");
+ return -EINVAL;
+ }
+
+ ops = tcf_proto_lookup_ops(name, true, extack);
if (IS_ERR(ops))
return PTR_ERR(ops);
- if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
+ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump ||
+ !ops->tmplt_reoffload) {
NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
+ module_put(ops->owner);
return -EOPNOTSUPP;
}
@@ -2129,16 +3089,15 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
return 0;
}
-static void tc_chain_tmplt_del(struct tcf_chain *chain)
+static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv)
{
- const struct tcf_proto_ops *ops = chain->tmplt_ops;
-
/* If template ops are set, no work to do for us. */
- if (!ops)
+ if (!tmplt_ops)
return;
- ops->tmplt_destroy(chain->tmplt_priv);
- module_put(ops->owner);
+ tmplt_ops->tmplt_destroy(tmplt_priv);
+ module_put(tmplt_ops->owner);
}
/* Add/delete/get a chain */
@@ -2151,18 +3110,16 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n,
struct tcmsg *t;
u32 parent;
u32 chain_index;
- struct Qdisc *q = NULL;
- struct tcf_chain *chain = NULL;
+ struct Qdisc *q;
+ struct tcf_chain *chain;
struct tcf_block *block;
unsigned long cl;
int err;
- if (n->nlmsg_type != RTM_GETCHAIN &&
- !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
replay:
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ q = NULL;
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -2175,12 +3132,14 @@ replay:
if (IS_ERR(block))
return PTR_ERR(block);
- chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0);
if (chain_index > TC_ACT_EXT_VAL_MASK) {
NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
err = -EINVAL;
goto errout_block;
}
+
+ mutex_lock(&block->lock);
chain = tcf_chain_lookup(block, chain_index);
if (n->nlmsg_type == RTM_NEWCHAIN) {
if (chain) {
@@ -2192,58 +3151,65 @@ replay:
} else {
NL_SET_ERR_MSG(extack, "Filter chain already exists");
err = -EEXIST;
- goto errout_block;
+ goto errout_block_locked;
}
} else {
if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
err = -ENOENT;
- goto errout_block;
+ goto errout_block_locked;
}
chain = tcf_chain_create(block, chain_index);
if (!chain) {
NL_SET_ERR_MSG(extack, "Failed to create filter chain");
err = -ENOMEM;
- goto errout_block;
+ goto errout_block_locked;
}
}
} else {
if (!chain || tcf_chain_held_by_acts_only(chain)) {
NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
err = -EINVAL;
- goto errout_block;
+ goto errout_block_locked;
}
tcf_chain_hold(chain);
}
+ if (n->nlmsg_type == RTM_NEWCHAIN) {
+ /* Modifying chain requires holding parent block lock. In case
+ * the chain was successfully added, take a reference to the
+ * chain. This ensures that an empty chain does not disappear at
+ * the end of this function.
+ */
+ tcf_chain_hold(chain);
+ chain->explicitly_created = true;
+ }
+ mutex_unlock(&block->lock);
+
switch (n->nlmsg_type) {
case RTM_NEWCHAIN:
err = tc_chain_tmplt_add(chain, net, tca, extack);
- if (err)
+ if (err) {
+ tcf_chain_put_explicitly_created(chain);
goto errout;
- /* In case the chain was successfully added, take a reference
- * to the chain. This ensures that an empty chain
- * does not disappear at the end of this function.
- */
- tcf_chain_hold(chain);
- chain->explicitly_created = true;
+ }
+
tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
- RTM_NEWCHAIN, false);
+ RTM_NEWCHAIN, false, extack);
break;
case RTM_DELCHAIN:
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER);
+ chain, RTM_DELTFILTER, extack);
/* Flush the chain first as the user requested chain removal. */
- tcf_chain_flush(chain);
+ tcf_chain_flush(chain, true);
/* In case the chain was successfully deleted, put a reference
* to the chain previously taken during addition.
*/
tcf_chain_put_explicitly_created(chain);
- chain->explicitly_created = false;
break;
case RTM_GETCHAIN:
err = tc_chain_notify(chain, skb, n->nlmsg_seq,
- n->nlmsg_seq, n->nlmsg_type, true);
+ n->nlmsg_flags, n->nlmsg_type, true, extack);
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send chain notify message");
break;
@@ -2256,11 +3222,15 @@ replay:
errout:
tcf_chain_put(chain);
errout_block:
- tcf_block_release(q, block);
+ tcf_block_release(q, block, true);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
return err;
+
+errout_block_locked:
+ mutex_unlock(&block->lock);
+ goto errout_block;
}
/* called with RTNL */
@@ -2270,18 +3240,17 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
struct nlattr *tca[TCA_MAX + 1];
struct Qdisc *q = NULL;
struct tcf_block *block;
- struct tcf_chain *chain;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct tcf_chain *chain;
long index_start;
long index;
- u32 parent;
int err;
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
- err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- cb->extack);
+ err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, cb->extack);
if (err)
return err;
@@ -2289,13 +3258,6 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
goto out;
- /* If we work with block index, q is NULL and parent value
- * will never be used in the following code. The check
- * in tcf_fill_node prevents it. However, compiler does not
- * see that far, so set parent to zero to silence the warning
- * about parent being uninitialized.
- */
- parent = 0;
} else {
const struct Qdisc_class_ops *cops;
struct net_device *dev;
@@ -2305,13 +3267,11 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
if (!dev)
return skb->len;
- parent = tcm->tcm_parent;
- if (!parent) {
- q = dev->qdisc;
- parent = q->handle;
- } else {
+ if (!tcm->tcm_parent)
+ q = rtnl_dereference(dev->qdisc);
+ else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
- }
+
if (!q)
goto out;
cops = q->ops->cl_ops;
@@ -2334,6 +3294,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
index_start = cb->args[0];
index = 0;
+ mutex_lock(&block->lock);
list_for_each_entry(chain, &block->chain_list, list) {
if ((tca[TCA_CHAIN] &&
nla_get_u32(tca[TCA_CHAIN]) != chain->index))
@@ -2344,17 +3305,19 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
}
if (tcf_chain_held_by_acts_only(chain))
continue;
- err = tc_chain_fill_node(chain, net, skb, block,
+ err = tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv,
+ chain->index, net, skb, block,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWCHAIN);
+ RTM_NEWCHAIN, NULL);
if (err <= 0)
break;
index++;
}
+ mutex_unlock(&block->lock);
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
- tcf_block_refcnt_put(block);
+ tcf_block_refcnt_put(block, true);
cb->args[0] = index;
out:
@@ -2364,47 +3327,102 @@ out:
return skb->len;
}
+int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action,
+ int police, struct tcf_proto *tp, u32 handle,
+ bool use_action_miss)
+{
+ int err = 0;
+
+#ifdef CONFIG_NET_CLS_ACT
+ exts->type = 0;
+ exts->nr_actions = 0;
+ exts->miss_cookie_node = NULL;
+ /* Note: we do not own yet a reference on net.
+ * This reference might be taken later from tcf_exts_get_net().
+ */
+ exts->net = net;
+ exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
+ GFP_KERNEL);
+ if (!exts->actions)
+ return -ENOMEM;
+#endif
+
+ exts->action = action;
+ exts->police = police;
+
+ if (!use_action_miss)
+ return 0;
+
+ err = tcf_exts_miss_cookie_base_alloc(exts, tp, handle);
+ if (err)
+ goto err_miss_alloc;
+
+ return 0;
+
+err_miss_alloc:
+ tcf_exts_destroy(exts);
+#ifdef CONFIG_NET_CLS_ACT
+ exts->actions = NULL;
+#endif
+ return err;
+}
+EXPORT_SYMBOL(tcf_exts_init_ex);
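
A hedged sketch of how a classifier's ->change() path could use the extended initializer to request an action-miss cookie; struct my_filter and the choice of TCA_FLOWER_ACT are illustrative, not part of this patch.

struct my_filter {			/* hypothetical per-filter state */
	struct tcf_exts exts;
};

static int my_init_exts(struct net *net, struct tcf_proto *tp,
			struct my_filter *f, u32 handle)
{
	/* the last argument asks for a miss cookie so hardware can hand
	 * partially executed action chains back to software
	 */
	return tcf_exts_init_ex(&f->exts, net, TCA_FLOWER_ACT, 0,
				tp, handle, true);
}
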
+
void tcf_exts_destroy(struct tcf_exts *exts)
{
+ tcf_exts_miss_cookie_base_destroy(exts);
+
#ifdef CONFIG_NET_CLS_ACT
- tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
- kfree(exts->actions);
+ if (exts->actions) {
+ tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
+ kfree(exts->actions);
+ }
exts->nr_actions = 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
-int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
- struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr,
- struct netlink_ext_ack *extack)
+int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
+ struct nlattr *rate_tlv, struct tcf_exts *exts,
+ u32 flags, u32 fl_flags, struct netlink_ext_ack *extack)
{
#ifdef CONFIG_NET_CLS_ACT
{
+ int init_res[TCA_ACT_MAX_PRIO] = {};
struct tc_action *act;
size_t attr_size = 0;
if (exts->police && tb[exts->police]) {
+ struct tc_action_ops *a_o;
+
+ flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND;
+ a_o = tc_action_load_ops(tb[exts->police], flags,
+ extack);
+ if (IS_ERR(a_o))
+ return PTR_ERR(a_o);
act = tcf_action_init_1(net, tp, tb[exts->police],
- rate_tlv, "police", ovr,
- TCA_ACT_BIND, true, extack);
+ rate_tlv, a_o, init_res, flags,
+ extack);
+ module_put(a_o->owner);
if (IS_ERR(act))
return PTR_ERR(act);
act->type = exts->type = TCA_OLD_COMPAT;
exts->actions[0] = act;
exts->nr_actions = 1;
+ tcf_idr_insert_many(exts->actions, init_res);
} else if (exts->action && tb[exts->action]) {
int err;
+ flags |= TCA_ACT_FLAGS_BIND;
err = tcf_action_init(net, tp, tb[exts->action],
- rate_tlv, NULL, ovr, TCA_ACT_BIND,
- exts->actions, &attr_size, true,
+ rate_tlv, exts->actions, init_res,
+ &attr_size, flags, fl_flags,
extack);
if (err < 0)
return err;
exts->nr_actions = err;
}
- exts->net = net;
}
#else
if ((exts->action && tb[exts->action]) ||
@@ -2416,6 +3434,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
return 0;
}
+EXPORT_SYMBOL(tcf_exts_validate_ex);
+
+int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
+ struct nlattr *rate_tlv, struct tcf_exts *exts,
+ u32 flags, struct netlink_ext_ack *extack)
+{
+ return tcf_exts_validate_ex(net, tp, tb, rate_tlv, exts,
+ flags, 0, extack);
+}
EXPORT_SYMBOL(tcf_exts_validate);
void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src)
@@ -2451,16 +3478,17 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
- nest = nla_nest_start(skb, exts->action);
+ nest = nla_nest_start_noflag(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
- if (tcf_action_dump(skb, exts->actions, 0, 0) < 0)
+ if (tcf_action_dump(skb, exts->actions, 0, 0, false)
+ < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
struct tc_action *act = tcf_exts_first_act(exts);
- nest = nla_nest_start(skb, exts->police);
+ nest = nla_nest_start_noflag(skb, exts->police);
if (nest == NULL || !act)
goto nla_put_failure;
if (tcf_action_dump_old(skb, act, 0, 0) < 0)
@@ -2479,6 +3507,31 @@ nla_put_failure:
}
EXPORT_SYMBOL(tcf_exts_dump);
+int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct nlattr *nest;
+
+ if (!exts->action || !tcf_exts_has_actions(exts))
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, exts->action);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+#else
+ return 0;
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_terse_dump);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
@@ -2491,18 +3544,62 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
-int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
- void *type_data, bool err_stop)
+static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
{
- struct tcf_block_cb *block_cb;
+ if (*flags & TCA_CLS_FLAGS_IN_HW)
+ return;
+ *flags |= TCA_CLS_FLAGS_IN_HW;
+ atomic_inc(&block->offloadcnt);
+}
+
+static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
+{
+ if (!(*flags & TCA_CLS_FLAGS_IN_HW))
+ return;
+ *flags &= ~TCA_CLS_FLAGS_IN_HW;
+ atomic_dec(&block->offloadcnt);
+}
+
+static void tc_cls_offload_cnt_update(struct tcf_block *block,
+ struct tcf_proto *tp, u32 *cnt,
+ u32 *flags, u32 diff, bool add)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ if (add) {
+ if (!*cnt)
+ tcf_block_offload_inc(block, flags);
+ *cnt += diff;
+ } else {
+ *cnt -= diff;
+ if (!*cnt)
+ tcf_block_offload_dec(block, flags);
+ }
+ spin_unlock(&tp->lock);
+}
+
+static void
+tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp,
+ u32 *cnt, u32 *flags)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ tcf_block_offload_dec(block, flags);
+ *cnt = 0;
+ spin_unlock(&tp->lock);
+}
+
+static int
+__tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop)
+{
+ struct flow_block_cb *block_cb;
int ok_count = 0;
int err;
- /* Make sure all netdevs sharing this block are offload-capable. */
- if (block->nooffloaddevcnt && err_stop)
- return -EOPNOTSUPP;
-
- list_for_each_entry(block_cb, &block->cb_list, list) {
+ list_for_each_entry(block_cb, &block->flow_block.cb_list, list) {
err = block_cb->cb(type, type_data, block_cb->cb_priv);
if (err) {
if (err_stop)
@@ -2513,8 +3610,453 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
}
return ok_count;
}
+
+int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count;
+}
EXPORT_SYMBOL(tc_setup_cb_call);
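
The callbacks iterated by __tc_setup_cb_call() are ordinary flow block callbacks registered by drivers at block bind time. A hedged driver-side sketch is below; the my_ name is a placeholder and the body only marks where real programming of hardware would happen.

static int my_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
				void *cb_priv)
{
	switch (type) {
	case TC_SETUP_CLSFLOWER:
		/* cb_priv is the driver state passed at registration;
		 * type_data is the struct flow_cls_offload to program
		 */
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}
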
+/* Non-destructive filter add. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offloads counter. On failure,
+ * previously offloaded filter is considered to be intact and offloads counter
+ * is not decremented.
+ */
+
+int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags,
+ ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return min(ok_count, 0);
+}
+EXPORT_SYMBOL(tc_setup_cb_add);
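
A hedged sketch of the classifier side of the add helper, loosely modeled on how flower drives it; struct my_hw_filter, its flags/in_hw_count members, and the mostly empty offload payload are illustrative.

struct my_hw_filter {			/* hypothetical per-filter state */
	u32 flags;			/* TCA_CLS_FLAGS_* */
	unsigned int in_hw_count;
};

static int my_hw_add(struct tcf_proto *tp, struct my_hw_filter *f,
		     struct netlink_ext_ack *extack)
{
	struct tcf_block *block = tp->chain->block;
	struct flow_cls_offload cls_flower = {};
	bool skip_sw = tc_skip_sw(f->flags);
	int err;

	cls_flower.command = FLOW_CLS_REPLACE;
	/* real code would also fill cls_flower.common, .cookie and .rule */

	err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
			      skip_sw, &f->flags, &f->in_hw_count, true);
	if (err)
		return err;

	if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
		return -EINVAL;	/* skip_sw requested but nothing offloaded */

	return 0;
}
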
+
+/* Destructive filter replace. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offload counter. On failure,
+ * previously offloaded filter is considered to be destroyed and offload counter
+ * is decremented.
+ */
+
+int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *old_flags, unsigned int *old_in_hw_count,
+ u32 *new_flags, unsigned int *new_in_hw_count,
+ bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, new_in_hw_count,
+ new_flags, ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return min(ok_count, 0);
+}
+EXPORT_SYMBOL(tc_setup_cb_replace);
+
+/* Destroy filter and decrement block offload counter, if filter was previously
+ * offloaded.
+ */
+
+int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+ tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return min(ok_count, 0);
+}
+EXPORT_SYMBOL(tc_setup_cb_destroy);
+
+int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
+ bool add, flow_setup_cb_t *cb,
+ enum tc_setup_type type, void *type_data,
+ void *cb_priv, u32 *flags, unsigned int *in_hw_count)
+{
+ int err = cb(type, type_data, cb_priv);
+
+ if (err) {
+ if (add && tc_skip_sw(*flags))
+ return err;
+ } else {
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1,
+ add);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_reoffload);
+
+static int tcf_act_get_user_cookie(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ struct tc_cookie *user_cookie;
+ int err = 0;
+
+ rcu_read_lock();
+ user_cookie = rcu_dereference(act->user_cookie);
+ if (user_cookie) {
+ entry->user_cookie = flow_action_cookie_create(user_cookie->data,
+ user_cookie->len,
+ GFP_ATOMIC);
+ if (!entry->user_cookie)
+ err = -ENOMEM;
+ }
+ rcu_read_unlock();
+ return err;
+}
+
+static void tcf_act_put_user_cookie(struct flow_action_entry *entry)
+{
+ flow_action_cookie_destroy(entry->user_cookie);
+}
+
+void tc_cleanup_offload_action(struct flow_action *flow_action)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ flow_action_for_each(i, entry, flow_action) {
+ tcf_act_put_user_cookie(entry);
+ if (entry->destructor)
+ entry->destructor(entry->destructor_priv);
+ }
+}
+EXPORT_SYMBOL(tc_cleanup_offload_action);
+
+static int tc_setup_offload_act(struct tc_action *act,
+ struct flow_action_entry *entry,
+ u32 *index_inc,
+ struct netlink_ext_ack *extack)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ if (act->ops->offload_act_setup) {
+ return act->ops->offload_act_setup(act, entry, index_inc, true,
+ extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "Action does not support offload");
+ return -EOPNOTSUPP;
+ }
+#else
+ return 0;
+#endif
+}
+
+int tc_setup_action(struct flow_action *flow_action,
+ struct tc_action *actions[],
+ u32 miss_cookie_base,
+ struct netlink_ext_ack *extack)
+{
+ int i, j, k, index, err = 0;
+ struct tc_action *act;
+
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED);
+
+ if (!actions)
+ return 0;
+
+ j = 0;
+ tcf_act_for_each_action(i, act, actions) {
+ struct flow_action_entry *entry;
+
+ entry = &flow_action->entries[j];
+ spin_lock_bh(&act->tcfa_lock);
+ err = tcf_act_get_user_cookie(entry, act);
+ if (err)
+ goto err_out_locked;
+
+ index = 0;
+ err = tc_setup_offload_act(act, entry, &index, extack);
+ if (err)
+ goto err_out_locked;
+
+ for (k = 0; k < index ; k++) {
+ entry[k].hw_stats = tc_act_hw_stats(act->hw_stats);
+ entry[k].hw_index = act->tcfa_index;
+ entry[k].cookie = (unsigned long)act;
+ entry[k].miss_cookie =
+ tcf_exts_miss_cookie_get(miss_cookie_base, i);
+ }
+
+ j += index;
+
+ spin_unlock_bh(&act->tcfa_lock);
+ }
+
+err_out:
+ if (err)
+ tc_cleanup_offload_action(flow_action);
+
+ return err;
+err_out_locked:
+ spin_unlock_bh(&act->tcfa_lock);
+ goto err_out;
+}
+
+int tc_setup_offload_action(struct flow_action *flow_action,
+ const struct tcf_exts *exts,
+ struct netlink_ext_ack *extack)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ u32 miss_cookie_base;
+
+ if (!exts)
+ return 0;
+
+ miss_cookie_base = exts->miss_cookie_node ?
+ exts->miss_cookie_node->miss_cookie_base : 0;
+ return tc_setup_action(flow_action, exts->actions, miss_cookie_base,
+ extack);
+#else
+ return 0;
+#endif
+}
+EXPORT_SYMBOL(tc_setup_offload_action);
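
On the consuming side a driver typically walks the translated entries with flow_action_for_each(). A hedged sketch follows; struct my_rule is a placeholder for whatever hardware rule representation the driver keeps.

struct my_rule {			/* hypothetical hardware rule */
	bool drop;
	struct net_device *out_dev;
};

static int my_parse_actions(const struct flow_action *flow_action,
			    struct my_rule *rule)
{
	const struct flow_action_entry *entry;
	int i;

	flow_action_for_each(i, entry, flow_action) {
		switch (entry->id) {
		case FLOW_ACTION_DROP:
			rule->drop = true;
			break;
		case FLOW_ACTION_REDIRECT:
			rule->out_dev = entry->dev;
			break;
		default:
			return -EOPNOTSUPP;	/* unsupported action */
		}
	}

	return 0;
}
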
+
+unsigned int tcf_exts_num_actions(struct tcf_exts *exts)
+{
+ unsigned int num_acts = 0;
+ struct tc_action *act;
+ int i;
+
+ tcf_exts_for_each_action(i, act, exts) {
+ if (is_tcf_pedit(act))
+ num_acts += tcf_pedit_nkeys(act);
+ else
+ num_acts++;
+ }
+ return num_acts;
+}
+EXPORT_SYMBOL(tcf_exts_num_actions);
+
+#ifdef CONFIG_NET_CLS_ACT
+static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr,
+ u32 *p_block_index,
+ struct netlink_ext_ack *extack)
+{
+ *p_block_index = nla_get_u32(block_index_attr);
+ if (!*p_block_index) {
+ NL_SET_ERR_MSG(extack, "Block number may not be zero");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+ enum flow_block_binder_type binder_type,
+ struct nlattr *block_index_attr,
+ struct netlink_ext_ack *extack)
+{
+ u32 block_index;
+ int err;
+
+ if (!block_index_attr)
+ return 0;
+
+ err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+ if (err)
+ return err;
+
+ qe->info.binder_type = binder_type;
+ qe->info.chain_head_change = tcf_chain_head_change_dflt;
+ qe->info.chain_head_change_priv = &qe->filter_chain;
+ qe->info.block_index = block_index;
+
+ return tcf_block_get_ext(&qe->block, sch, &qe->info, extack);
+}
+EXPORT_SYMBOL(tcf_qevent_init);
+
+void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch)
+{
+ if (qe->info.block_index)
+ tcf_block_put_ext(qe->block, sch, &qe->info);
+}
+EXPORT_SYMBOL(tcf_qevent_destroy);
+
+int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+ struct netlink_ext_ack *extack)
+{
+ u32 block_index;
+ int err;
+
+ if (!block_index_attr)
+ return 0;
+
+ err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+ if (err)
+ return err;
+
+ /* Bounce newly-configured block or change in block. */
+ if (block_index != qe->info.block_index) {
+ NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tcf_qevent_validate_change);
+
+struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+ struct sk_buff **to_free, int *ret)
+{
+ struct tcf_result cl_res;
+ struct tcf_proto *fl;
+
+ if (!qe->info.block_index)
+ return skb;
+
+ fl = rcu_dereference_bh(qe->filter_chain);
+
+ switch (tcf_classify(skb, NULL, fl, &cl_res, false)) {
+ case TC_ACT_SHOT:
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ *ret = __NET_XMIT_BYPASS;
+ return NULL;
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ __qdisc_drop(skb, to_free);
+ *ret = __NET_XMIT_STOLEN;
+ return NULL;
+ case TC_ACT_REDIRECT:
+ skb_do_redirect(skb);
+ *ret = __NET_XMIT_STOLEN;
+ return NULL;
+ }
+
+ return skb;
+}
+EXPORT_SYMBOL(tcf_qevent_handle);
+
+int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe)
+{
+ if (!qe->info.block_index)
+ return 0;
+ return nla_put_u32(skb, attr_name, qe->info.block_index);
+}
+EXPORT_SYMBOL(tcf_qevent_dump);
+#endif
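
A hedged sketch of a qdisc wiring one of these qevents, roughly following the way sch_red hooks its mark/early-drop blocks; struct my_sched_data and the chosen binder type are illustrative.

struct my_sched_data {			/* hypothetical qdisc private data */
	struct tcf_qevent qe_mark;
};

static int my_qevent_setup(struct Qdisc *sch, struct nlattr *block_attr,
			   struct netlink_ext_ack *extack)
{
	struct my_sched_data *q = qdisc_priv(sch);

	return tcf_qevent_init(&q->qe_mark, sch,
			       FLOW_BLOCK_BINDER_TYPE_RED_MARK,
			       block_attr, extack);
}

static struct sk_buff *my_qevent_run(struct Qdisc *sch, struct sk_buff *skb,
				     struct sk_buff **to_free, int *ret)
{
	struct my_sched_data *q = qdisc_priv(sch);

	/* may consume the skb (drop/steal); caller must honour a NULL return */
	return tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, ret);
}
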
+
static __net_init int tcf_net_init(struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
@@ -2538,6 +4080,19 @@ static struct pernet_operations tcf_net_ops = {
.size = sizeof(struct tcf_net),
};
+static const struct rtnl_msg_handler tc_filter_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWTFILTER, .doit = tc_new_tfilter,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.msgtype = RTM_DELTFILTER, .doit = tc_del_tfilter,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.msgtype = RTM_GETTFILTER, .doit = tc_get_tfilter,
+ .dumpit = tc_dump_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.msgtype = RTM_NEWCHAIN, .doit = tc_ctl_chain},
+ {.msgtype = RTM_DELCHAIN, .doit = tc_ctl_chain},
+ {.msgtype = RTM_GETCHAIN, .doit = tc_ctl_chain,
+ .dumpit = tc_dump_chain},
+};
+
static int __init tc_filter_init(void)
{
int err;
@@ -2550,24 +4105,11 @@ static int __init tc_filter_init(void)
if (err)
goto err_register_pernet_subsys;
- err = rhashtable_init(&indr_setup_block_ht,
- &tc_indr_setup_block_ht_params);
- if (err)
- goto err_rhash_setup_block_ht;
-
- rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
- tc_dump_tfilter, 0);
- rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
- tc_dump_chain, 0);
+ xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1);
+ rtnl_register_many(tc_filter_rtnl_msg_handlers);
return 0;
-err_rhash_setup_block_ht:
- unregister_pernet_subsys(&tcf_net_ops);
err_register_pernet_subsys:
destroy_workqueue(tc_filter_wq);
return err;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 4a57fec6f306..ecfaa4f9a04e 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_basic.c Basic Packet Classifier.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -22,6 +18,7 @@
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
struct basic_head {
struct list_head flist;
@@ -40,8 +37,9 @@ struct basic_filter {
struct rcu_work rwork;
};
-static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int basic_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
int r;
struct basic_head *head = rcu_dereference_bh(tp->root);
@@ -107,7 +105,8 @@ static void basic_delete_filter_work(struct work_struct *work)
rtnl_unlock();
}
-static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void basic_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f, *n;
@@ -126,7 +125,7 @@ static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
}
static int basic_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f = arg;
@@ -148,12 +147,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
static int basic_set_parms(struct net *net, struct tcf_proto *tp,
struct basic_filter *f, unsigned long base,
struct nlattr **tb,
- struct nlattr *est, bool ovr,
+ struct nlattr *est, u32 flags,
struct netlink_ext_ack *extack)
{
int err;
- err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack);
if (err < 0)
return err;
@@ -172,8 +171,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
static int basic_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, void **arg, bool ovr,
- struct netlink_ext_ack *extack)
+ struct nlattr **tca, void **arg,
+ u32 flags, struct netlink_ext_ack *extack)
{
int err;
struct basic_head *head = rtnl_dereference(tp->root);
@@ -184,8 +183,8 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
- basic_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
+ basic_policy, NULL);
if (err < 0)
return err;
@@ -198,7 +197,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
if (!fnew)
return -ENOBUFS;
- err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ err = tcf_exts_init(&fnew->exts, net, TCA_BASIC_ACT, TCA_BASIC_POLICE);
if (err < 0)
goto errout;
@@ -219,7 +218,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr,
+ err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], flags,
extack);
if (err < 0) {
if (!fold)
@@ -247,34 +246,28 @@ errout:
return err;
}
-static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f;
list_for_each_entry(f, &head->flist, link) {
- if (arg->count < arg->skip)
- goto skip;
-
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
+ if (!tc_cls_stats_dump(tp, arg, f))
break;
- }
-skip:
- arg->count++;
}
}
-static void basic_bind_class(void *fh, u32 classid, unsigned long cl)
+static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct basic_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ tc_cls_bind_class(classid, cl, q, &f->res, base);
}
static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct tc_basic_pcnt gpf = {};
struct basic_filter *f = fh;
@@ -286,7 +279,7 @@ static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -335,6 +328,7 @@ static struct tcf_proto_ops cls_basic_ops __read_mostly = {
.bind_class = basic_bind_class,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_CLS("basic");
static int __init init_basic(void)
{
@@ -348,4 +342,5 @@ static void __exit exit_basic(void)
module_init(init_basic)
module_exit(exit_basic)
+MODULE_DESCRIPTION("TC basic classifier");
MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index a95cb240a606..a32754a2658b 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Berkeley Packet Filter based traffic classifier
*
@@ -6,10 +7,6 @@
* ematches.
*
* (C) 2013 Daniel Borkmann <dborkman@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -22,6 +19,7 @@
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
+#include <net/tc_wrapper.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
@@ -80,16 +78,15 @@ static int cls_bpf_exec_opcode(int code)
}
}
-static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
bool at_ingress = skb_at_tc_ingress(skb);
struct cls_bpf_prog *prog;
int ret = -1;
- /* Needed here for accessing maps. */
- rcu_read_lock();
list_for_each_entry_rcu(prog, &head->plist, link) {
int filter_res;
@@ -100,13 +97,13 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
} else if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len);
- bpf_compute_data_pointers(skb);
- filter_res = BPF_PROG_RUN(prog->filter, skb);
+ filter_res = bpf_prog_run_data_pointers(prog->filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
- bpf_compute_data_pointers(skb);
- filter_res = BPF_PROG_RUN(prog->filter, skb);
+ filter_res = bpf_prog_run_data_pointers(prog->filter, skb);
}
+ if (unlikely(!skb->tstamp && skb->tstamp_type))
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
if (prog->exts_integrated) {
res->class = 0;
@@ -134,7 +131,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
break;
}
- rcu_read_unlock();
return ret;
}
@@ -157,8 +153,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
skip_sw = prog && tc_skip_sw(prog->gen_flags);
obj = prog ?: oldprog;
- tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags,
- extack);
+ tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags, extack);
cls_bpf.command = TC_CLSBPF_OFFLOAD;
cls_bpf.exts = &obj->exts;
cls_bpf.prog = prog ? prog->filter : NULL;
@@ -166,18 +161,24 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
cls_bpf.name = obj->bpf_name;
cls_bpf.exts_integrated = obj->exts_integrated;
- if (oldprog)
- tcf_block_offload_dec(block, &oldprog->gen_flags);
+ if (oldprog && prog)
+ err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count,
+ &prog->gen_flags, &prog->in_hw_count,
+ true);
+ else if (prog)
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &prog->gen_flags,
+ &prog->in_hw_count, true);
+ else
+ err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count, true);
- err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
- if (prog) {
- if (err < 0) {
- cls_bpf_offload_cmd(tp, oldprog, prog, extack);
- return err;
- } else if (err > 0) {
- prog->in_hw_count = err;
- tcf_block_offload_inc(block, &prog->gen_flags);
- }
+ if (prog && err) {
+ cls_bpf_offload_cmd(tp, oldprog, prog, extack);
+ return err;
}
if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
@@ -234,7 +235,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true);
}
static int cls_bpf_init(struct tcf_proto *tp)
@@ -298,7 +299,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
}
static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
@@ -307,7 +308,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last,
return 0;
}
-static void cls_bpf_destroy(struct tcf_proto *tp,
+static void cls_bpf_destroy(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
@@ -403,71 +404,26 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
return 0;
}
-static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
- struct cls_bpf_prog *prog, unsigned long base,
- struct nlattr **tb, struct nlattr *est, bool ovr,
- struct netlink_ext_ack *extack)
-{
- bool is_bpf, is_ebpf, have_exts = false;
- u32 gen_flags = 0;
- int ret;
-
- is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
- is_ebpf = tb[TCA_BPF_FD];
- if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
- return -EINVAL;
-
- ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, extack);
- if (ret < 0)
- return ret;
-
- if (tb[TCA_BPF_FLAGS]) {
- u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
-
- if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT)
- return -EINVAL;
-
- have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
- }
- if (tb[TCA_BPF_FLAGS_GEN]) {
- gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
- if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
- !tc_flags_valid(gen_flags))
- return -EINVAL;
- }
-
- prog->exts_integrated = have_exts;
- prog->gen_flags = gen_flags;
-
- ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
- cls_bpf_prog_from_efd(tb, prog, gen_flags, tp);
- if (ret < 0)
- return ret;
-
- if (tb[TCA_BPF_CLASSID]) {
- prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
- tcf_bind_filter(tp, &prog->res, base);
- }
-
- return 0;
-}
-
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ bool is_bpf, is_ebpf, have_exts = false;
struct cls_bpf_prog *oldprog = *arg;
struct nlattr *tb[TCA_BPF_MAX + 1];
+ bool bound_to_filter = false;
struct cls_bpf_prog *prog;
+ u32 gen_flags = 0;
int ret;
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy,
- NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_BPF_MAX, tca[TCA_OPTIONS],
+ bpf_policy, NULL);
if (ret < 0)
return ret;
@@ -475,7 +431,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
if (!prog)
return -ENOBUFS;
- ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ ret = tcf_exts_init(&prog->exts, net, TCA_BPF_ACT, TCA_BPF_POLICE);
if (ret < 0)
goto errout;
@@ -499,11 +455,51 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
goto errout;
prog->handle = handle;
- ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr,
- extack);
+ is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
+ is_ebpf = tb[TCA_BPF_FD];
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) {
+ ret = -EINVAL;
+ goto errout_idr;
+ }
+
+ ret = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &prog->exts,
+ flags, extack);
+ if (ret < 0)
+ goto errout_idr;
+
+ if (tb[TCA_BPF_FLAGS]) {
+ u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
+
+ if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
+ ret = -EINVAL;
+ goto errout_idr;
+ }
+
+ have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
+ }
+ if (tb[TCA_BPF_FLAGS_GEN]) {
+ gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
+ if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
+ !tc_flags_valid(gen_flags)) {
+ ret = -EINVAL;
+ goto errout_idr;
+ }
+ }
+
+ prog->exts_integrated = have_exts;
+ prog->gen_flags = gen_flags;
+
+ ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
+ cls_bpf_prog_from_efd(tb, prog, gen_flags, tp);
if (ret < 0)
goto errout_idr;
+ if (tb[TCA_BPF_CLASSID]) {
+ prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+ tcf_bind_filter(tp, &prog->res, base);
+ bound_to_filter = true;
+ }
+
ret = cls_bpf_offload(tp, prog, oldprog, extack);
if (ret)
goto errout_parms;
@@ -511,6 +507,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
if (!tc_in_hw(prog->gen_flags))
prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ tcf_proto_update_usesw(tp, prog->gen_flags);
+
if (oldprog) {
idr_replace(&head->handle_idr, prog, handle);
list_replace_rcu(&oldprog->link, &prog->link);
@@ -525,6 +523,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
return 0;
errout_parms:
+ if (bound_to_filter)
+ tcf_unbind_filter(tp, &prog->res);
cls_bpf_free_parms(prog);
errout_idr:
if (!oldprog)
@@ -575,7 +575,7 @@ static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
}
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *tm)
+ struct sk_buff *skb, struct tcmsg *tm, bool rtnl_held)
{
struct cls_bpf_prog *prog = fh;
struct nlattr *nest;
@@ -589,7 +589,7 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh,
cls_bpf_offload_update_stats(tp, prog);
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -627,32 +627,27 @@ nla_put_failure:
return -1;
}
-static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl)
+static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl,
+ void *q, unsigned long base)
{
struct cls_bpf_prog *prog = fh;
- if (prog && prog->res.classid == classid)
- prog->res.class = cl;
+ tc_cls_bind_class(classid, cl, q, &prog->res, base);
}
-static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *prog;
list_for_each_entry(prog, &head->plist, link) {
- if (arg->count < arg->skip)
- goto skip;
- if (arg->fn(tp, prog, arg) < 0) {
- arg->stop = 1;
+ if (!tc_cls_stats_dump(tp, arg, prog))
break;
- }
-skip:
- arg->count++;
}
}
-static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
@@ -674,15 +669,11 @@ static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
- if (err) {
- if (add && tc_skip_sw(prog->gen_flags))
- return err;
- continue;
- }
-
- tc_cls_offload_cnt_update(block, &prog->in_hw_count,
- &prog->gen_flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF,
+ &cls_bpf, cb_priv, &prog->gen_flags,
+ &prog->in_hw_count);
+ if (err)
+ return err;
}
return 0;
@@ -702,6 +693,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
.dump = cls_bpf_dump,
.bind_class = cls_bpf_bind_class,
};
+MODULE_ALIAS_NET_CLS("bpf");
static int __init cls_bpf_init_mod(void)
{
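The walk rewrite in the cls_bpf_walk hunk above (and the matching flow_walk hunk further down) drops the open-coded skip/count/stop bookkeeping in favour of a single tc_cls_stats_dump() call. A minimal sketch of the logic that helper consolidates, reconstructed from the removed lines (illustrative only, not the in-tree definition):

static inline bool tc_cls_stats_dump_sketch(struct tcf_proto *tp,
					    struct tcf_walker *arg,
					    void *filter)
{
	/* skip the first arg->skip entries, then hand each filter to arg->fn */
	if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) {
		arg->stop = 1;		/* caller breaks out of its list walk */
		return false;
	}
	arg->count++;
	return true;
}

Each classifier's walk loop then reduces to "if (!tc_cls_stats_dump(tp, arg, f)) break;", as the new hunks show.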
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 3bc01bdde165..424252982d6a 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_cgroup.c Control Group Classifier
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -17,6 +13,7 @@
#include <net/pkt_cls.h>
#include <net/sock.h>
#include <net/cls_cgroup.h>
+#include <net/tc_wrapper.h>
struct cls_cgroup_head {
u32 handle;
@@ -26,12 +23,15 @@ struct cls_cgroup_head {
struct rcu_work rwork;
};
-static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int cls_cgroup_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
u32 classid = task_get_classid(skb);
+ if (unlikely(!head))
+ return -1;
if (!classid)
return -1;
if (!tcf_em_tree_match(skb, &head->ematches, NULL))
@@ -78,7 +78,7 @@ static void cls_cgroup_destroy_work(struct work_struct *work)
static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr,
+ void **arg, u32 flags,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_CGROUP_MAX + 1];
@@ -99,17 +99,18 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
if (!new)
return -ENOBUFS;
- err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ err = tcf_exts_init(&new->exts, net, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
if (err < 0)
goto errout;
new->handle = handle;
new->tp = tp;
- err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
- cgroup_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CGROUP_MAX,
+ tca[TCA_OPTIONS], cgroup_policy,
+ NULL);
if (err < 0)
goto errout;
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr,
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, flags,
extack);
if (err < 0)
goto errout;
@@ -130,7 +131,7 @@ errout:
return err;
}
-static void cls_cgroup_destroy(struct tcf_proto *tp,
+static void cls_cgroup_destroy(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
@@ -145,18 +146,21 @@ static void cls_cgroup_destroy(struct tcf_proto *tp,
}
static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
return -EOPNOTSUPP;
}
-static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
if (arg->count < arg->skip)
goto skip;
+ if (!head)
+ return;
if (arg->fn(tp, head, arg) < 0) {
arg->stop = 1;
return;
@@ -166,14 +170,14 @@ skip:
}
static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
struct nlattr *nest;
t->tcm_handle = head->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -205,6 +209,7 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
.dump = cls_cgroup_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_CLS("cgroup");
static int __init init_cgroup_cls(void)
{
@@ -218,4 +223,5 @@ static void __exit exit_cgroup_cls(void)
module_init(init_cgroup_cls);
module_exit(exit_cgroup_cls);
+MODULE_DESCRIPTION("TC cgroup classifier");
MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 2bb043cd436b..5693b41b093f 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_flow.c Generic flow classifier
*
* Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -28,6 +24,7 @@
#include <net/ip.h>
#include <net/route.h>
#include <net/flow_dissector.h>
+#include <net/tc_wrapper.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
@@ -84,7 +81,7 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
if (dst)
return ntohl(dst);
- return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
+ return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true);
}
static u32 flow_get_proto(const struct sk_buff *skb,
@@ -108,7 +105,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb,
if (flow->ports.ports)
return ntohs(flow->ports.dst);
- return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
+ return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true);
}
static u32 flow_get_iif(const struct sk_buff *skb)
@@ -155,7 +152,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
static u32 flow_get_nfct_src(const struct sk_buff *skb,
const struct flow_keys *flow)
{
- switch (tc_skb_protocol(skb)) {
+ switch (skb_protocol(skb, true)) {
case htons(ETH_P_IP):
return ntohl(CTTUPLE(skb, src.u3.ip));
case htons(ETH_P_IPV6):
@@ -168,7 +165,7 @@ fallback:
static u32 flow_get_nfct_dst(const struct sk_buff *skb,
const struct flow_keys *flow)
{
- switch (tc_skb_protocol(skb)) {
+ switch (skb_protocol(skb, true)) {
case htons(ETH_P_IP):
return ntohl(CTTUPLE(skb, dst.u3.ip));
case htons(ETH_P_IPV6):
@@ -229,7 +226,7 @@ static u32 flow_get_skgid(const struct sk_buff *skb)
static u32 flow_get_vlan_tag(const struct sk_buff *skb)
{
- u16 uninitialized_var(tag);
+ u16 tag;
if (vlan_get_tag(skb, &tag) < 0)
return 0;
@@ -296,8 +293,9 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
(1 << FLOW_KEY_NFCT_PROTO_SRC) | \
(1 << FLOW_KEY_NFCT_PROTO_DST))
-static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct flow_head *head = rcu_dereference_bh(tp->root);
struct flow_filter *f;
@@ -347,7 +345,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
static void flow_perturbation(struct timer_list *t)
{
- struct flow_filter *f = from_timer(f, t, perturb_timer);
+ struct flow_filter *f = timer_container_of(f, t, perturb_timer);
get_random_bytes(&f->hashrnd, 4);
if (f->perturb_period)
@@ -358,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
[TCA_FLOW_KEYS] = { .type = NLA_U32 },
[TCA_FLOW_MODE] = { .type = NLA_U32 },
[TCA_FLOW_BASECLASS] = { .type = NLA_U32 },
- [TCA_FLOW_RSHIFT] = { .type = NLA_U32 },
+ [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32,
+ 31 /* BITS_PER_U32 - 1 */),
[TCA_FLOW_ADDEND] = { .type = NLA_U32 },
[TCA_FLOW_MASK] = { .type = NLA_U32 },
[TCA_FLOW_XOR] = { .type = NLA_U32 },
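The new NLA_POLICY_MAX bound on TCA_FLOW_RSHIFT exists because the attribute is used as a right-shift count on a 32-bit key, and shifting a u32 by 32 or more is undefined behaviour in C, so 31 (BITS_PER_U32 - 1) is the largest value that can be accepted. A rough sketch of that mapping (parameter names are placeholders, not the exact cls_flow code):

static u32 flow_map_key(u32 key, u32 mask, u32 xor_val, u32 rshift, u32 addend)
{
	key = (key & mask) ^ xor_val;
	/* rshift is validated to 0..31 by the policy above; a larger
	 * shift count on a u32 would be undefined behaviour.
	 */
	return (key >> rshift) + addend;
}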
@@ -371,7 +370,7 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
static void __flow_destroy_filter(struct flow_filter *f)
{
- del_timer_sync(&f->perturb_timer);
+ timer_shutdown_sync(&f->perturb_timer);
tcf_exts_destroy(&f->exts);
tcf_em_tree_destroy(&f->ematches);
tcf_exts_put_net(&f->exts);
@@ -391,7 +390,8 @@ static void flow_destroy_filter_work(struct work_struct *work)
static int flow_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *fold, *fnew;
@@ -407,7 +407,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FLOW_MAX, opt, flow_policy,
+ NULL);
if (err < 0)
return err;
@@ -440,11 +441,11 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
goto err1;
- err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ err = tcf_exts_init(&fnew->exts, net, TCA_FLOW_ACT, TCA_FLOW_POLICE);
if (err < 0)
goto err2;
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr,
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, flags,
extack);
if (err < 0)
goto err2;
@@ -566,7 +567,7 @@ err1:
}
static int flow_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f = arg;
@@ -590,7 +591,8 @@ static int flow_init(struct tcf_proto *tp)
return 0;
}
-static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void flow_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f, *next;
@@ -617,7 +619,7 @@ static void *flow_get(struct tcf_proto *tp, u32 handle)
}
static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct flow_filter *f = fh;
struct nlattr *nest;
@@ -627,7 +629,7 @@ static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh,
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -677,20 +679,15 @@ nla_put_failure:
return -1;
}
-static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f;
list_for_each_entry(f, &head->filters, list) {
- if (arg->count < arg->skip)
- goto skip;
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
+ if (!tc_cls_stats_dump(tp, arg, f))
break;
- }
-skip:
- arg->count++;
}
}
@@ -706,6 +703,7 @@ static struct tcf_proto_ops cls_flow_ops __read_mostly = {
.walk = flow_walk,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_CLS("flow");
static int __init cls_flow_init(void)
{
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index f6aa57fbbbaf..7669371c1354 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_flower.c Flower classifier
*
* Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -14,23 +10,49 @@
#include <linux/module.h>
#include <linux/rhashtable.h>
#include <linux/workqueue.h>
+#include <linux/refcount.h>
+#include <linux/bitfield.h>
#include <linux/if_ether.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/mpls.h>
+#include <linux/ppp_defs.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
#include <net/ip.h>
#include <net/flow_dissector.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
+#include <net/gtp.h>
+#include <net/pfcp.h>
+#include <net/tc_wrapper.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <uapi/linux/netfilter/nf_conntrack_common.h>
+
+#define TCA_FLOWER_KEY_CT_FLAGS_MAX \
+ ((__TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) << 1)
+#define TCA_FLOWER_KEY_CT_FLAGS_MASK \
+ (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1)
+
+#define TCA_FLOWER_KEY_FLAGS_POLICY_MASK \
+ (TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT | \
+ TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST)
+
+#define TCA_FLOWER_KEY_ENC_FLAGS_POLICY_MASK \
+ (TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM | \
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT | \
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM | \
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT)
+
struct fl_flow_key {
- int indev_ifindex;
+ struct flow_dissector_key_meta meta;
struct flow_dissector_key_control control;
struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
@@ -55,8 +77,14 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
- struct flow_dissector_key_ports tp_min;
- struct flow_dissector_key_ports tp_max;
+ struct flow_dissector_key_ports_range tp_range;
+ struct flow_dissector_key_ct ct;
+ struct flow_dissector_key_hash hash;
+ struct flow_dissector_key_num_of_vlans num_of_vlans;
+ struct flow_dissector_key_pppoe pppoe;
+ struct flow_dissector_key_l2tpv3 l2tpv3;
+ struct flow_dissector_key_ipsec ipsec;
+ struct flow_dissector_key_cfm cfm;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -75,6 +103,7 @@ struct fl_flow_mask {
struct list_head filters;
struct rcu_work rwork;
struct list_head list;
+ refcount_t refcnt;
};
struct fl_flow_tmplt {
@@ -86,7 +115,9 @@ struct fl_flow_tmplt {
struct cls_fl_head {
struct rhashtable ht;
+ spinlock_t masks_lock; /* Protect masks list */
struct list_head masks;
+ struct list_head hw_filters;
struct rcu_work rwork;
struct idr handle_idr;
};
@@ -99,11 +130,19 @@ struct cls_fl_filter {
struct tcf_result res;
struct fl_flow_key key;
struct list_head list;
+ struct list_head hw_list;
u32 handle;
u32 flags;
u32 in_hw_count;
+ u8 needs_tc_skb_ext:1;
struct rcu_work rwork;
struct net_device *hw_dev;
+ /* Flower classifier is unlocked, which means that its reference counter
+ * can be changed concurrently without any kind of external
+ * synchronization. Use atomic reference counter to be concurrency-safe.
+ */
+ refcount_t refcnt;
+ bool deleted;
};
static const struct rhashtable_params mask_ht_params = {
@@ -186,21 +225,21 @@ static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_min.dst);
- max_mask = htons(filter->mask->key.tp_max.dst);
- min_val = htons(filter->key.tp_min.dst);
- max_val = htons(filter->key.tp_max.dst);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.dst);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.dst);
+ min_val = ntohs(filter->key.tp_range.tp_min.dst);
+ max_val = ntohs(filter->key.tp_range.tp_max.dst);
if (min_mask && max_mask) {
- if (htons(key->tp.dst) < min_val ||
- htons(key->tp.dst) > max_val)
+ if (ntohs(key->tp_range.tp.dst) < min_val ||
+ ntohs(key->tp_range.tp.dst) > max_val)
return false;
/* skb does not have min and max values */
- mkey->tp_min.dst = filter->mkey.tp_min.dst;
- mkey->tp_max.dst = filter->mkey.tp_max.dst;
+ mkey->tp_range.tp_min.dst = filter->mkey.tp_range.tp_min.dst;
+ mkey->tp_range.tp_max.dst = filter->mkey.tp_range.tp_max.dst;
}
return true;
}
@@ -209,21 +248,21 @@ static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_min.src);
- max_mask = htons(filter->mask->key.tp_max.src);
- min_val = htons(filter->key.tp_min.src);
- max_val = htons(filter->key.tp_max.src);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.src);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.src);
+ min_val = ntohs(filter->key.tp_range.tp_min.src);
+ max_val = ntohs(filter->key.tp_range.tp_max.src);
if (min_mask && max_mask) {
- if (htons(key->tp.src) < min_val ||
- htons(key->tp.src) > max_val)
+ if (ntohs(key->tp_range.tp.src) < min_val ||
+ ntohs(key->tp_range.tp.src) > max_val)
return false;
/* skb does not have min and max values */
- mkey->tp_min.src = filter->mkey.tp_min.src;
- mkey->tp_max.src = filter->mkey.tp_max.src;
+ mkey->tp_range.tp_min.src = filter->mkey.tp_range.tp_min.src;
+ mkey->tp_range.tp_max.src = filter->mkey.tp_range.tp_max.src;
}
return true;
}
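The two range-compare hunks above switch the locals from __be16 to u16 and the conversions from htons() to ntohs(): ports are carried in network byte order, and relational min/max checks are only meaningful on host-order values. (htons() and ntohs() both byte-swap on little-endian hosts, so the old code happened to work there, but ntohs() is the semantically correct direction and keeps the endianness annotations honest.) A userspace illustration of the check, with ntohs() standing in for the kernel helper:

#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>

/* port_be, min_be, max_be are wire-format (big-endian) port numbers */
static bool port_in_range(uint16_t port_be, uint16_t min_be, uint16_t max_be)
{
	uint16_t port = ntohs(port_be);

	/* e.g. wire value 0x0001 (port 1) reads as 256 if compared raw on a
	 * little-endian CPU, so convert before any ordering comparison.
	 */
	return port >= ntohs(min_be) && port <= ntohs(max_be);
}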
@@ -255,39 +294,63 @@ static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask,
return NULL;
}
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
- struct fl_flow_key *mkey,
- struct fl_flow_key *key)
+static noinline_for_stack
+struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key)
{
- if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
- return fl_lookup_range(mask, mkey, key);
+ struct fl_flow_key mkey;
- return __fl_lookup(mask, mkey);
-}
+ fl_set_masked_key(&mkey, key, mask);
+ if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
+ return fl_lookup_range(mask, &mkey, key);
+
+ return __fl_lookup(mask, &mkey);
+}
+
+static u16 fl_ct_info_to_flower_map[] = {
+ [IP_CT_ESTABLISHED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+ [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+ [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED |
+ TCA_FLOWER_KEY_CT_FLAGS_REPLY,
+ [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED |
+ TCA_FLOWER_KEY_CT_FLAGS_REPLY,
+ [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_NEW,
+};
-static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
- struct cls_fl_filter *f;
- struct fl_flow_mask *mask;
+ bool post_ct = qdisc_skb_cb(skb)->post_ct;
+ u16 zone = tc_skb_cb(skb)->zone;
struct fl_flow_key skb_key;
- struct fl_flow_key skb_mkey;
+ struct fl_flow_mask *mask;
+ struct cls_fl_filter *f;
list_for_each_entry_rcu(mask, &head->masks, list) {
+ flow_dissector_init_keys(&skb_key.control, &skb_key.basic);
fl_clear_masked_range(&skb_key, mask);
- skb_key.indev_ifindex = skb->skb_iif;
+ skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
/* skb_flow_dissect() does not set n_proto in case an unknown
* protocol, so do it rather here.
*/
- skb_key.basic.n_proto = skb->protocol;
+ skb_key.basic.n_proto = skb_protocol(skb, false);
skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
- skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
-
- fl_set_masked_key(&skb_mkey, &skb_key, mask);
-
- f = fl_lookup(mask, &skb_mkey, &skb_key);
+ skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
+ fl_ct_info_to_flower_map,
+ ARRAY_SIZE(fl_ct_info_to_flower_map),
+ post_ct, zone);
+ skb_flow_dissect_hash(skb, &mask->dissector, &skb_key);
+ skb_flow_dissect(skb, &mask->dissector, &skb_key,
+ FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP);
+
+ f = fl_mask_lookup(mask, &skb_key);
if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
@@ -304,16 +367,22 @@ static int fl_init(struct tcf_proto *tp)
if (!head)
return -ENOBUFS;
+ spin_lock_init(&head->masks_lock);
INIT_LIST_HEAD_RCU(&head->masks);
+ INIT_LIST_HEAD(&head->hw_filters);
rcu_assign_pointer(tp->root, head);
idr_init(&head->handle_idr);
return rhashtable_init(&head->ht, &mask_ht_params);
}
-static void fl_mask_free(struct fl_flow_mask *mask)
+static void fl_mask_free(struct fl_flow_mask *mask, bool mask_init_done)
{
- rhashtable_destroy(&mask->ht);
+ /* temporary masks don't have their filters list and ht initialized */
+ if (mask_init_done) {
+ WARN_ON(!list_empty(&mask->filters));
+ rhashtable_destroy(&mask->ht);
+ }
kfree(mask);
}
@@ -322,27 +391,47 @@ static void fl_mask_free_work(struct work_struct *work)
struct fl_flow_mask *mask = container_of(to_rcu_work(work),
struct fl_flow_mask, rwork);
- fl_mask_free(mask);
+ fl_mask_free(mask, true);
+}
+
+static void fl_uninit_mask_free_work(struct work_struct *work)
+{
+ struct fl_flow_mask *mask = container_of(to_rcu_work(work),
+ struct fl_flow_mask, rwork);
+
+ fl_mask_free(mask, false);
}
-static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
- bool async)
+static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
{
- if (!list_empty(&mask->filters))
+ if (!refcount_dec_and_test(&mask->refcnt))
return false;
rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
+
+ spin_lock(&head->masks_lock);
list_del_rcu(&mask->list);
- if (async)
- tcf_queue_work(&mask->rwork, fl_mask_free_work);
- else
- fl_mask_free(mask);
+ spin_unlock(&head->masks_lock);
+
+ tcf_queue_work(&mask->rwork, fl_mask_free_work);
return true;
}
+static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
+{
+ /* Flower classifier only changes root pointer during init and destroy.
+ * Users must obtain reference to tcf_proto instance before calling its
+ * API, so tp->root pointer is protected from concurrent call to
+ * fl_destroy() by reference counting.
+ */
+ return rcu_dereference_raw(tp->root);
+}
+
static void __fl_destroy_filter(struct cls_fl_filter *f)
{
+ if (f->needs_tc_skb_ext)
+ tc_skb_ext_tc_disable();
tcf_exts_destroy(&f->exts);
tcf_exts_put_net(&f->exts);
kfree(f);
@@ -353,50 +442,61 @@ static void fl_destroy_filter_work(struct work_struct *work)
struct cls_fl_filter *f = container_of(to_rcu_work(work),
struct cls_fl_filter, rwork);
- rtnl_lock();
__fl_destroy_filter(f);
- rtnl_unlock();
}
static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_DESTROY;
+ cls_flower.command = FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long) f;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
- tcf_block_offload_dec(block, &f->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ &f->flags, &f->in_hw_count, rtnl_held);
+
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
- struct cls_fl_filter *f,
+ struct cls_fl_filter *f, bool rtnl_held,
struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
bool skip_sw = tc_skip_sw(f->flags);
- int err;
+ int err = 0;
+
+ cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts));
+ if (!cls_flower.rule)
+ return -ENOMEM;
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_REPLACE;
+ cls_flower.command = FLOW_CLS_REPLACE;
cls_flower.cookie = (unsigned long) f;
- cls_flower.dissector = &f->mask->dissector;
- cls_flower.mask = &f->mask->key;
- cls_flower.key = &f->mkey;
- cls_flower.exts = &f->exts;
+ cls_flower.rule->match.dissector = &f->mask->dissector;
+ cls_flower.rule->match.mask = &f->mask->key;
+ cls_flower.rule->match.key = &f->mkey;
cls_flower.classid = f->res.classid;
- err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw);
- if (err < 0) {
- fl_hw_destroy_filter(tp, f, NULL);
+ err = tc_setup_offload_action(&cls_flower.rule->action, &f->exts,
+ cls_flower.common.extack);
+ if (err) {
+ kfree(cls_flower.rule);
+
+ return skip_sw ? err : 0;
+ }
+
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
+ skip_sw, &f->flags, &f->in_hw_count, rtnl_held);
+ tc_cleanup_offload_action(&cls_flower.rule->action);
+ kfree(cls_flower.rule);
+
+ if (err) {
+ fl_hw_destroy_filter(tp, f, rtnl_held, NULL);
return err;
- } else if (err > 0) {
- f->in_hw_count = err;
- tcf_block_offload_inc(block, &f->flags);
}
if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
@@ -405,39 +505,84 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
return 0;
}
-static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
+static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
+ bool rtnl_held)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
- cls_flower.command = TC_CLSFLOWER_STATS;
+ cls_flower.command = FLOW_CLS_STATS;
cls_flower.cookie = (unsigned long) f;
- cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ rtnl_held);
+
+ tcf_exts_hw_stats_update(&f->exts, &cls_flower.stats, cls_flower.use_act_stats);
}
-static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
- struct netlink_ext_ack *extack)
+static void __fl_put(struct cls_fl_filter *f)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
- bool async = tcf_exts_get_net(&f->exts);
- bool last;
+ if (!refcount_dec_and_test(&f->refcnt))
+ return;
+ if (tcf_exts_get_net(&f->exts))
+ tcf_queue_work(&f->rwork, fl_destroy_filter_work);
+ else
+ __fl_destroy_filter(f);
+}
+
+static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
+{
+ struct cls_fl_filter *f;
+
+ rcu_read_lock();
+ f = idr_find(&head->handle_idr, handle);
+ if (f && !refcount_inc_not_zero(&f->refcnt))
+ f = NULL;
+ rcu_read_unlock();
+
+ return f;
+}
+
+static struct tcf_exts *fl_get_exts(const struct tcf_proto *tp, u32 handle)
+{
+ struct cls_fl_head *head = rcu_dereference_bh(tp->root);
+ struct cls_fl_filter *f;
+
+ f = idr_find(&head->handle_idr, handle);
+ return f ? &f->exts : NULL;
+}
+
+static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+ bool *last, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ *last = false;
+
+ spin_lock(&tp->lock);
+ if (f->deleted) {
+ spin_unlock(&tp->lock);
+ return -ENOENT;
+ }
+
+ f->deleted = true;
+ rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+ f->mask->filter_ht_params);
idr_remove(&head->handle_idr, f->handle);
list_del_rcu(&f->list);
- last = fl_mask_put(head, f->mask, async);
+ spin_unlock(&tp->lock);
+
+ *last = fl_mask_put(head, f->mask);
if (!tc_skip_hw(f->flags))
- fl_hw_destroy_filter(tp, f, extack);
+ fl_hw_destroy_filter(tp, f, rtnl_held, extack);
tcf_unbind_filter(tp, &f->res);
- if (async)
- tcf_queue_work(&f->rwork, fl_destroy_filter_work);
- else
- __fl_destroy_filter(f);
+ __fl_put(f);
- return last;
+ return 0;
}
static void fl_destroy_sleepable(struct work_struct *work)
@@ -451,15 +596,18 @@ static void fl_destroy_sleepable(struct work_struct *work)
module_put(THIS_MODULE);
}
-static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void fl_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct fl_flow_mask *mask, *next_mask;
struct cls_fl_filter *f, *next;
+ bool last;
list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
list_for_each_entry_safe(f, next, &mask->filters, list) {
- if (__fl_delete(tp, f, extack))
+ __fl_delete(tp, f, &last, rtnl_held, extack);
+ if (last)
break;
}
}
@@ -469,15 +617,23 @@ static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
tcf_queue_work(&head->rwork, fl_destroy_sleepable);
}
+static void fl_put(struct tcf_proto *tp, void *arg)
+{
+ struct cls_fl_filter *f = arg;
+
+ __fl_put(f);
+}
+
static void *fl_get(struct tcf_proto *tp, u32 handle)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
- return idr_find(&head->handle_idr, handle);
+ return __fl_get(head, handle);
}
static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
- [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC },
+ [TCA_FLOWER_UNSPEC] = { .strict_start_type =
+ TCA_FLOWER_L2_MISS },
[TCA_FLOWER_CLASSID] = { .type = NLA_U32 },
[TCA_FLOWER_INDEV] = { .type = NLA_STRING,
.len = IFNAMSIZ },
@@ -523,8 +679,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 },
- [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 },
- [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_FLAGS] = NLA_POLICY_MASK(NLA_BE32,
+ TCA_FLOWER_KEY_FLAGS_POLICY_MASK),
+ [TCA_FLOWER_KEY_FLAGS_MASK] = NLA_POLICY_MASK(NLA_BE32,
+ TCA_FLOWER_KEY_FLAGS_POLICY_MASK),
[TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 },
@@ -547,6 +705,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_MPLS_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 },
@@ -562,11 +721,44 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_CT_STATE] =
+ NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK),
+ [TCA_FLOWER_KEY_CT_STATE_MASK] =
+ NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK),
+ [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_FLAGS] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_NUM_OF_VLANS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_SPI] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_SPI_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1),
+ [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_FLAGS] = NLA_POLICY_MASK(NLA_BE32,
+ TCA_FLOWER_KEY_ENC_FLAGS_POLICY_MASK),
+ [TCA_FLOWER_KEY_ENC_FLAGS_MASK] = NLA_POLICY_MASK(NLA_BE32,
+ TCA_FLOWER_KEY_ENC_FLAGS_POLICY_MASK),
};
static const struct nla_policy
enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN },
[TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_GTP] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_PFCP] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -574,7 +766,48 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
- .len = 128 },
+ .len = 127 },
+};
+
+static const struct nla_policy
+vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy
+gtp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_GTP_QFI] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy
+pfcp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID] = { .type = NLA_U64 },
+};
+
+static const struct nla_policy
+mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = {
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_TC] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX + 1] = {
+ [TCA_FLOWER_KEY_CFM_MD_LEVEL] = NLA_POLICY_MAX(NLA_U8,
+ FLOW_DIS_CFM_MDL_MAX),
+ [TCA_FLOWER_KEY_CFM_OPCODE] = { .type = NLA_U8 },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -583,69 +816,271 @@ static void fl_set_key_val(struct nlattr **tb,
{
if (!tb[val_type])
return;
- memcpy(val, nla_data(tb[val_type]), len);
+ nla_memcpy(val, tb[val_type], len);
if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type])
memset(mask, 0xff, len);
else
- memcpy(mask, nla_data(tb[mask_type]), len);
+ nla_memcpy(mask, tb[mask_type], len);
+}
+
+static int fl_set_key_spi(struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ if (key->basic.ip_proto != IPPROTO_ESP &&
+ key->basic.ip_proto != IPPROTO_AH) {
+ NL_SET_ERR_MSG(extack,
+ "Protocol must be either ESP or AH");
+ return -EINVAL;
+ }
+
+ fl_set_key_val(tb, &key->ipsec.spi,
+ TCA_FLOWER_KEY_SPI,
+ &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK,
+ sizeof(key->ipsec.spi));
+ return 0;
}
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
- struct fl_flow_key *mask)
-{
- fl_set_key_val(tb, &key->tp_min.dst,
- TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
- fl_set_key_val(tb, &key->tp_max.dst,
- TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
- fl_set_key_val(tb, &key->tp_min.src,
- TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
- fl_set_key_val(tb, &key->tp_max.src,
- TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
-
- if ((mask->tp_min.dst && mask->tp_max.dst &&
- htons(key->tp_max.dst) <= htons(key->tp_min.dst)) ||
- (mask->tp_min.src && mask->tp_max.src &&
- htons(key->tp_max.src) <= htons(key->tp_min.src)))
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ fl_set_key_val(tb, &key->tp_range.tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.dst));
+ fl_set_key_val(tb, &key->tp_range.tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_range.tp_max.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.dst));
+ fl_set_key_val(tb, &key->tp_range.tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_range.tp_min.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.src));
+ fl_set_key_val(tb, &key->tp_range.tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
+
+ if (mask->tp_range.tp_min.dst != mask->tp_range.tp_max.dst) {
+ NL_SET_ERR_MSG(extack,
+ "Both min and max destination ports must be specified");
+ return -EINVAL;
+ }
+ if (mask->tp_range.tp_min.src != mask->tp_range.tp_max.src) {
+ NL_SET_ERR_MSG(extack,
+ "Both min and max source ports must be specified");
+ return -EINVAL;
+ }
+ if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
+ ntohs(key->tp_range.tp_max.dst) <=
+ ntohs(key->tp_range.tp_min.dst)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_DST_MIN],
+ "Invalid destination port range (min must be strictly smaller than max)");
+ return -EINVAL;
+ }
+ if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
+ ntohs(key->tp_range.tp_max.src) <=
+ ntohs(key->tp_range.tp_min.src)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
+ "Invalid source port range (min must be strictly smaller than max)");
return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fl_set_key_mpls_lse(const struct nlattr *nla_lse,
+ struct flow_dissector_key_mpls *key_val,
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1];
+ struct flow_dissector_mpls_lse *lse_mask;
+ struct flow_dissector_mpls_lse *lse_val;
+ u8 lse_index;
+ u8 depth;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, nla_lse,
+ mpls_stack_entry_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]) {
+ NL_SET_ERR_MSG(extack, "Missing MPLS option \"depth\"");
+ return -EINVAL;
+ }
+
+ depth = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]);
+
+ /* LSE depth starts at 1, for consistency with terminology used by
+ * RFC 3031 (section 3.9), where depth 0 refers to unlabeled packets.
+ */
+ if (depth < 1 || depth > FLOW_DIS_MPLS_MAX) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH],
+ "Invalid MPLS depth");
+ return -EINVAL;
+ }
+ lse_index = depth - 1;
+
+ dissector_set_mpls_lse(key_val, lse_index);
+ dissector_set_mpls_lse(key_mask, lse_index);
+
+ lse_val = &key_val->ls[lse_index];
+ lse_mask = &key_mask->ls[lse_index];
+
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]) {
+ lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]);
+ lse_mask->mpls_ttl = MPLS_TTL_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]) {
+ u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]);
+
+ if (bos & ~MPLS_BOS_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS],
+ "Bottom Of Stack (BOS) must be 0 or 1");
+ return -EINVAL;
+ }
+ lse_val->mpls_bos = bos;
+ lse_mask->mpls_bos = MPLS_BOS_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]) {
+ u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]);
+
+ if (tc & ~MPLS_TC_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC],
+ "Traffic Class (TC) must be between 0 and 7");
+ return -EINVAL;
+ }
+ lse_val->mpls_tc = tc;
+ lse_mask->mpls_tc = MPLS_TC_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]) {
+ u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]);
+
+ if (label & ~MPLS_LABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL],
+ "Label must be between 0 and 1048575");
+ return -EINVAL;
+ }
+ lse_val->mpls_label = label;
+ lse_mask->mpls_label = MPLS_LABEL_MASK;
+ }
+
+ return 0;
+}
+
+static int fl_set_key_mpls_opts(const struct nlattr *nla_mpls_opts,
+ struct flow_dissector_key_mpls *key_val,
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *nla_lse;
+ int rem;
+ int err;
+
+ if (!(nla_mpls_opts->nla_type & NLA_F_NESTED)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_mpls_opts,
+ "NLA_F_NESTED is missing");
+ return -EINVAL;
+ }
+
+ nla_for_each_nested(nla_lse, nla_mpls_opts, rem) {
+ if (nla_type(nla_lse) != TCA_FLOWER_KEY_MPLS_OPTS_LSE) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_lse,
+ "Invalid MPLS option type");
+ return -EINVAL;
+ }
+
+ err = fl_set_key_mpls_lse(nla_lse, key_val, key_mask, extack);
+ if (err < 0)
+ return err;
+ }
+ if (rem) {
+ NL_SET_ERR_MSG(extack,
+ "Bytes leftover after parsing MPLS options");
+ return -EINVAL;
+ }
return 0;
}
static int fl_set_key_mpls(struct nlattr **tb,
struct flow_dissector_key_mpls *key_val,
- struct flow_dissector_key_mpls *key_mask)
-{
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct flow_dissector_mpls_lse *lse_mask;
+ struct flow_dissector_mpls_lse *lse_val;
+
+ if (tb[TCA_FLOWER_KEY_MPLS_OPTS]) {
+ if (tb[TCA_FLOWER_KEY_MPLS_TTL] ||
+ tb[TCA_FLOWER_KEY_MPLS_BOS] ||
+ tb[TCA_FLOWER_KEY_MPLS_TC] ||
+ tb[TCA_FLOWER_KEY_MPLS_LABEL]) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPTS],
+ "MPLS label, Traffic Class, Bottom Of Stack and Time To Live must be encapsulated in the MPLS options attribute");
+ return -EBADMSG;
+ }
+
+ return fl_set_key_mpls_opts(tb[TCA_FLOWER_KEY_MPLS_OPTS],
+ key_val, key_mask, extack);
+ }
+
+ lse_val = &key_val->ls[0];
+ lse_mask = &key_mask->ls[0];
+
if (tb[TCA_FLOWER_KEY_MPLS_TTL]) {
- key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]);
- key_mask->mpls_ttl = MPLS_TTL_MASK;
+ lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]);
+ lse_mask->mpls_ttl = MPLS_TTL_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
if (tb[TCA_FLOWER_KEY_MPLS_BOS]) {
u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]);
- if (bos & ~MPLS_BOS_MASK)
+ if (bos & ~MPLS_BOS_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_BOS],
+ "Bottom Of Stack (BOS) must be 0 or 1");
return -EINVAL;
- key_val->mpls_bos = bos;
- key_mask->mpls_bos = MPLS_BOS_MASK;
+ }
+ lse_val->mpls_bos = bos;
+ lse_mask->mpls_bos = MPLS_BOS_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
if (tb[TCA_FLOWER_KEY_MPLS_TC]) {
u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]);
- if (tc & ~MPLS_TC_MASK)
+ if (tc & ~MPLS_TC_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_TC],
+ "Traffic Class (TC) must be between 0 and 7");
return -EINVAL;
- key_val->mpls_tc = tc;
- key_mask->mpls_tc = MPLS_TC_MASK;
+ }
+ lse_val->mpls_tc = tc;
+ lse_mask->mpls_tc = MPLS_TC_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) {
u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]);
- if (label & ~MPLS_LABEL_MASK)
+ if (label & ~MPLS_LABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_LABEL],
+ "Label must be between 0 and 1048575");
return -EINVAL;
- key_val->mpls_label = label;
- key_mask->mpls_label = MPLS_LABEL_MASK;
+ }
+ lse_val->mpls_label = label;
+ lse_mask->mpls_label = MPLS_LABEL_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
return 0;
}
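The bounds enforced above (label no larger than 1048575, TC no larger than 7, BOS 0 or 1, TTL one byte) follow from the 32-bit MPLS label stack entry layout of RFC 3032, which is what MPLS_LABEL_MASK, MPLS_TC_MASK, MPLS_BOS_MASK and MPLS_TTL_MASK describe. A small illustrative packer (mpls_lse_pack is a made-up helper for this sketch, not kernel API):

#include <stdint.h>

/* RFC 3032 label stack entry:
 *   31           12 11  9   8   7       0
 *  +---------------+-----+---+-----------+
 *  |     Label     | TC  | S |    TTL    |
 *  +---------------+-----+---+-----------+
 */
static uint32_t mpls_lse_pack(uint32_t label, uint8_t tc, uint8_t bos, uint8_t ttl)
{
	return (label & 0xfffffu) << 12 |	/* 20-bit label, 0..1048575 */
	       (uint32_t)(tc & 0x7) << 9 |	/* 3-bit traffic class, 0..7 */
	       (uint32_t)(bos & 0x1) << 8 |	/* bottom-of-stack flag */
	       ttl;				/* 8-bit TTL */
}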
@@ -653,6 +1088,7 @@ static int fl_set_key_mpls(struct nlattr **tb,
static void fl_set_key_vlan(struct nlattr **tb,
__be16 ethertype,
int vlan_id_key, int vlan_prio_key,
+ int vlan_next_eth_type_key,
struct flow_dissector_key_vlan *key_val,
struct flow_dissector_key_vlan *key_mask)
{
@@ -669,8 +1105,59 @@ static void fl_set_key_vlan(struct nlattr **tb,
VLAN_PRIORITY_MASK;
key_mask->vlan_priority = VLAN_PRIORITY_MASK;
}
- key_val->vlan_tpid = ethertype;
- key_mask->vlan_tpid = cpu_to_be16(~0);
+ if (ethertype) {
+ key_val->vlan_tpid = ethertype;
+ key_mask->vlan_tpid = cpu_to_be16(~0);
+ }
+ if (tb[vlan_next_eth_type_key]) {
+ key_val->vlan_eth_type =
+ nla_get_be16(tb[vlan_next_eth_type_key]);
+ key_mask->vlan_eth_type = cpu_to_be16(~0);
+ }
+}
+
+static void fl_set_key_pppoe(struct nlattr **tb,
+ struct flow_dissector_key_pppoe *key_val,
+ struct flow_dissector_key_pppoe *key_mask,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mask)
+{
+ /* key_val::type must be set to ETH_P_PPP_SES
+ * because ETH_P_PPP_SES was stored in basic.n_proto
+ * which might get overwritten by ppp_proto
+ * or might be set to 0, the role of key_val::type
+ * is similar to vlan_key::tpid
+ */
+ key_val->type = htons(ETH_P_PPP_SES);
+ key_mask->type = cpu_to_be16(~0);
+
+ if (tb[TCA_FLOWER_KEY_PPPOE_SID]) {
+ key_val->session_id =
+ nla_get_be16(tb[TCA_FLOWER_KEY_PPPOE_SID]);
+ key_mask->session_id = cpu_to_be16(~0);
+ }
+ if (tb[TCA_FLOWER_KEY_PPP_PROTO]) {
+ key_val->ppp_proto =
+ nla_get_be16(tb[TCA_FLOWER_KEY_PPP_PROTO]);
+ key_mask->ppp_proto = cpu_to_be16(~0);
+
+ if (key_val->ppp_proto == htons(PPP_IP)) {
+ key->basic.n_proto = htons(ETH_P_IP);
+ mask->basic.n_proto = cpu_to_be16(~0);
+ } else if (key_val->ppp_proto == htons(PPP_IPV6)) {
+ key->basic.n_proto = htons(ETH_P_IPV6);
+ mask->basic.n_proto = cpu_to_be16(~0);
+ } else if (key_val->ppp_proto == htons(PPP_MPLS_UC)) {
+ key->basic.n_proto = htons(ETH_P_MPLS_UC);
+ mask->basic.n_proto = cpu_to_be16(~0);
+ } else if (key_val->ppp_proto == htons(PPP_MPLS_MC)) {
+ key->basic.n_proto = htons(ETH_P_MPLS_MC);
+ mask->basic.n_proto = cpu_to_be16(~0);
+ }
+ } else {
+ key->basic.n_proto = 0;
+ mask->basic.n_proto = cpu_to_be16(0);
+ }
}
static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
@@ -684,17 +1171,29 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
}
}
-static int fl_set_key_flags(struct nlattr **tb,
- u32 *flags_key, u32 *flags_mask)
+static int fl_set_key_flags(struct nlattr *tca_opts, struct nlattr **tb,
+ bool encap, u32 *flags_key, u32 *flags_mask,
+ struct netlink_ext_ack *extack)
{
+ int fl_key, fl_mask;
u32 key, mask;
+ if (encap) {
+ fl_key = TCA_FLOWER_KEY_ENC_FLAGS;
+ fl_mask = TCA_FLOWER_KEY_ENC_FLAGS_MASK;
+ } else {
+ fl_key = TCA_FLOWER_KEY_FLAGS;
+ fl_mask = TCA_FLOWER_KEY_FLAGS_MASK;
+ }
+
/* mask is mandatory for flags */
- if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ if (NL_REQ_ATTR_CHECK(extack, tca_opts, tb, fl_mask)) {
+ NL_SET_ERR_MSG(extack, "Missing flags mask");
return -EINVAL;
+ }
- key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
- mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+ key = be32_to_cpu(nla_get_be32(tb[fl_key]));
+ mask = be32_to_cpu(nla_get_be32(tb[fl_mask]));
*flags_key = 0;
*flags_mask = 0;
@@ -705,6 +1204,21 @@ static int fl_set_key_flags(struct nlattr **tb,
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
FLOW_DIS_FIRST_FRAG);
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM,
+ FLOW_DIS_F_TUNNEL_CSUM);
+
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT,
+ FLOW_DIS_F_TUNNEL_DONT_FRAGMENT);
+
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, FLOW_DIS_F_TUNNEL_OAM);
+
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT,
+ FLOW_DIS_F_TUNNEL_CRIT_OPT);
+
return 0;
}
@@ -733,6 +1247,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
if (option_len > sizeof(struct geneve_opt))
data_len = option_len - sizeof(struct geneve_opt);
+ if (key->enc_opts.len > FLOW_DIS_TUN_OPTS_MAX - 4)
+ return -ERANGE;
+
opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len];
memset(opt, 0xff, option_len);
opt->length = data_len / 4;
@@ -749,8 +1266,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
- nla, geneve_opt_policy, extack);
+ err = nla_parse_nested_deprecated(tb,
+ TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
if (err < 0)
return err;
@@ -805,6 +1323,190 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
return sizeof(struct geneve_opt) + data_len;
}
+static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1];
+ struct vxlan_metadata *md;
+ int err;
+
+ md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) {
+ NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) {
+ md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]);
+ md->gbp &= VXLAN_GBP_MASK;
+ }
+
+ return sizeof(*md);
+}
+
+static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ struct erspan_metadata *md;
+ int err;
+
+ md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ md->version = 1;
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) {
+ NL_SET_ERR_MSG(extack, "Non-erspan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER])
+ md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]);
+
+ if (md->version == 1) {
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ memset(&md->u.index, 0xff, sizeof(md->u.index));
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(nla);
+ }
+ } else if (md->version == 2) {
+ if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ md->u.md2.dir = 1;
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ }
+ set_hwid(&md->u.md2, 0xff);
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ return sizeof(*md);
+}
+
+static int fl_set_gtp_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1];
+ struct gtp_pdu_session_info *sinfo;
+ u8 len = key->enc_opts.len;
+ int err;
+
+ sinfo = (struct gtp_pdu_session_info *)&key->enc_opts.data[len];
+ memset(sinfo, 0xff, option_len);
+
+ if (!depth)
+ return sizeof(*sinfo);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GTP) {
+ NL_SET_ERR_MSG_MOD(extack, "Non-gtp option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GTP_MAX, nla,
+ gtp_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len &&
+ (!tb[TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_GTP_QFI])) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing tunnel key gtp option pdu type or qfi");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE])
+ sinfo->pdu_type =
+ nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE]);
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GTP_QFI])
+ sinfo->qfi = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_GTP_QFI]);
+
+ return sizeof(*sinfo);
+}
+
+static int fl_set_pfcp_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1];
+ struct pfcp_metadata *md;
+ int err;
+
+ md = (struct pfcp_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_PFCP) {
+ NL_SET_ERR_MSG_MOD(extack, "Non-pfcp option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX, nla,
+ pfcp_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing tunnel key pfcp option type");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE])
+ md->type = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]);
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID])
+ md->seid = nla_get_be64(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]);
+
+ return sizeof(*md);
+}
+
static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
@@ -812,31 +1514,41 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL;
int err, option_len, key_depth, msk_depth = 0;
- err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS],
- TCA_FLOWER_KEY_ENC_OPTS_MAX,
- enc_opts_policy, extack);
+ err = nla_validate_nested_deprecated(tb[TCA_FLOWER_KEY_ENC_OPTS],
+ TCA_FLOWER_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
if (err)
return err;
nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]);
if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
- err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK],
- TCA_FLOWER_KEY_ENC_OPTS_MAX,
- enc_opts_policy, extack);
+ err = nla_validate_nested_deprecated(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK],
+ TCA_FLOWER_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
if (err)
return err;
nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+ if (!nla_ok(nla_opt_msk, msk_depth)) {
+ NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks");
+ return -EINVAL;
+ }
}
nla_for_each_attr(nla_opt_key, nla_enc_key,
nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
switch (nla_type(nla_opt_key)) {
case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+ if (key->enc_opts.dst_opt_type &&
+ key->enc_opts.dst_opt_type !=
+ IP_TUNNEL_GENEVE_OPT_BIT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
option_len = 0;
- key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+ key->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT;
option_len = fl_set_geneve_opt(nla_opt_key, key,
key_depth, option_len,
extack);
@@ -847,7 +1559,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
/* At the same time we need to parse through the mask
* in order to verify exact and mask attribute lengths.
*/
- mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+ mask->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT;
option_len = fl_set_geneve_opt(nla_opt_msk, mask,
msk_depth, option_len,
extack);
@@ -859,34 +1571,320 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
return -EINVAL;
}
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT;
+ option_len = fl_set_vxlan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT;
+ option_len = fl_set_vxlan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
- if (msk_depth)
- nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT;
+ option_len = fl_set_erspan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT;
+ option_len = fl_set_erspan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_GTP:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Duplicate type for gtp options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT;
+ option_len = fl_set_gtp_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT;
+ option_len = fl_set_gtp_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Key and mask miss aligned");
+ return -EINVAL;
+ }
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_PFCP:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG_MOD(extack, "Duplicate type for pfcp options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT;
+ option_len = fl_set_pfcp_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT;
+ option_len = fl_set_pfcp_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG_MOD(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
break;
default:
NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
return -EINVAL;
}
+
+ if (!msk_depth)
+ continue;
+
+ if (!nla_ok(nla_opt_msk, msk_depth)) {
+ NL_SET_ERR_MSG(extack, "A mask attribute is invalid");
+ return -EINVAL;
+ }
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ }
+
+ return 0;
+}
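
fl_set_enc_opt() walks the key options with nla_for_each_attr() but advances the mask stream by hand, which is why the added checks call nla_ok() before every nla_next(): the mask nest comes from userspace and may be shorter than the key nest. A hedged userspace re-implementation of the two primitives over a raw attribute buffer (struct layout per the netlink uapi: 16-bit length including the 4-byte header, 16-bit type, 4-byte alignment; this is a sketch, not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

struct nlattr {			/* same layout as the uapi struct nlattr */
	uint16_t nla_len;	/* header + payload, unaligned */
	uint16_t nla_type;
};

#define NLA_ALIGNTO	4
#define NLA_ALIGN(len)	(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))

/* True if 'remaining' bytes still hold one complete attribute. */
static int toy_nla_ok(const struct nlattr *nla, int remaining)
{
	return remaining >= (int)sizeof(*nla) &&
	       nla->nla_len >= sizeof(*nla) &&
	       nla->nla_len <= remaining;
}

/* Step to the next attribute, shrinking the remaining byte count. */
static const struct nlattr *toy_nla_next(const struct nlattr *nla, int *remaining)
{
	int totlen = NLA_ALIGN(nla->nla_len);

	*remaining -= totlen;
	return (const struct nlattr *)((const char *)nla + totlen);
}

int main(void)
{
	/* two attributes packed back to back (little-endian host assumed):
	 * type 1, len 5 (1 byte payload + 3 pad), then type 2, len 4 (empty)
	 */
	_Alignas(4) unsigned char buf[12] = { 5, 0, 1, 0, 0xaa, 0, 0, 0, 4, 0, 2, 0 };
	const struct nlattr *nla = (const struct nlattr *)buf;
	int rem = sizeof(buf);

	while (toy_nla_ok(nla, rem)) {
		printf("type %u len %u\n", (unsigned)nla->nla_type, (unsigned)nla->nla_len);
		nla = toy_nla_next(nla, &rem);
	}
	return 0;
}

Walking two streams in lockstep therefore needs an ok-check on each stream before each step, which is exactly what the "A mask attribute is invalid" branch at the bottom of the loop enforces.
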
+
+static int fl_validate_ct_state(u16 state, struct nlattr *tb,
+ struct netlink_ext_ack *extack)
+{
+ if (state && !(state & TCA_FLOWER_KEY_CT_FLAGS_TRACKED)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "no trk, so no other flag can be set");
+ return -EINVAL;
+ }
+
+ if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW &&
+ state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) {
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "new and est are mutually exclusive");
+ return -EINVAL;
+ }
+
+ if (state & TCA_FLOWER_KEY_CT_FLAGS_INVALID &&
+ state & ~(TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_INVALID)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "when inv is set, only trk may be set");
+ return -EINVAL;
+ }
+
+ if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW &&
+ state & TCA_FLOWER_KEY_CT_FLAGS_REPLY) {
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "new and rpl are mutually exclusive");
+ return -EINVAL;
+ }
+
+ return 0;
+}
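
fl_validate_ct_state() rejects masked ct_state combinations that conntrack can never report; note it is applied to key & mask, so only bits the user actually matches on are constrained. A standalone sketch of the same rules; the flag values are as they appear in the pkt_cls uapi enum at the time of writing (worth re-checking against the header you build against):

#include <stdio.h>

#define CT_NEW		(1 << 0)
#define CT_ESTABLISHED	(1 << 1)
#define CT_RELATED	(1 << 2)
#define CT_TRACKED	(1 << 3)
#define CT_INVALID	(1 << 4)
#define CT_REPLY	(1 << 5)

/* Mirror of the checks in fl_validate_ct_state() above. */
static int validate_ct_state(unsigned int state)
{
	if (state && !(state & CT_TRACKED))
		return -1;	/* any other flag requires +trk */
	if ((state & CT_NEW) && (state & CT_ESTABLISHED))
		return -1;	/* new and est are mutually exclusive */
	if ((state & CT_INVALID) && (state & ~(CT_TRACKED | CT_INVALID)))
		return -1;	/* inv may only be combined with trk */
	if ((state & CT_NEW) && (state & CT_REPLY))
		return -1;	/* a new connection has no reply direction yet */
	return 0;
}

int main(void)
{
	printf("+trk+est      -> %d\n", validate_ct_state(CT_TRACKED | CT_ESTABLISHED));
	printf("+new (no trk) -> %d\n", validate_ct_state(CT_NEW));
	printf("+trk+new+rpl  -> %d\n", validate_ct_state(CT_TRACKED | CT_NEW | CT_REPLY));
	return 0;
}
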
+
+static int fl_set_key_ct(struct nlattr **tb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask,
+ struct netlink_ext_ack *extack)
+{
+ if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+ int err;
+
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state));
+
+ err = fl_validate_ct_state(key->ct_state & mask->ct_state,
+ tb[TCA_FLOWER_KEY_CT_STATE_MASK],
+ extack);
+ if (err)
+ return err;
+
+ }
+ if (tb[TCA_FLOWER_KEY_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG(extack, "Conntrack zones isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack mark isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_LABELS]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG(extack, "Conntrack labels aren't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels));
+ }
+
+ return 0;
+}
+
+static bool is_vlan_key(struct nlattr *tb, __be16 *ethertype,
+ struct fl_flow_key *key, struct fl_flow_key *mask,
+ int vthresh)
+{
+ const bool good_num_of_vlans = key->num_of_vlans.num_of_vlans > vthresh;
+
+ if (!tb) {
+ *ethertype = 0;
+ return good_num_of_vlans;
}
+ *ethertype = nla_get_be16(tb);
+ if (good_num_of_vlans || eth_type_vlan(*ethertype))
+ return true;
+
+ key->basic.n_proto = *ethertype;
+ mask->basic.n_proto = cpu_to_be16(~0);
+ return false;
+}
+
+static void fl_set_key_cfm_md_level(struct nlattr **tb,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ u8 level;
+
+ if (!tb[TCA_FLOWER_KEY_CFM_MD_LEVEL])
+ return;
+
+ level = nla_get_u8(tb[TCA_FLOWER_KEY_CFM_MD_LEVEL]);
+ key->cfm.mdl_ver = FIELD_PREP(FLOW_DIS_CFM_MDL_MASK, level);
+ mask->cfm.mdl_ver = FLOW_DIS_CFM_MDL_MASK;
+}
+
+static void fl_set_key_cfm_opcode(struct nlattr **tb,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ fl_set_key_val(tb, &key->cfm.opcode, TCA_FLOWER_KEY_CFM_OPCODE,
+ &mask->cfm.opcode, TCA_FLOWER_UNSPEC,
+ sizeof(key->cfm.opcode));
+}
+
+static int fl_set_key_cfm(struct nlattr **tb,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX + 1];
+ int err;
+
+ if (!tb[TCA_FLOWER_KEY_CFM])
+ return 0;
+
+ err = nla_parse_nested(nla_cfm_opt, TCA_FLOWER_KEY_CFM_OPT_MAX,
+ tb[TCA_FLOWER_KEY_CFM], cfm_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ fl_set_key_cfm_opcode(nla_cfm_opt, key, mask, extack);
+ fl_set_key_cfm_md_level(nla_cfm_opt, key, mask, extack);
+
return 0;
}
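
fl_set_key_cfm_md_level() packs the 3-bit CFM maintenance-domain level into the shared mdl_ver byte with FIELD_PREP(), and the dump path later pulls it back out with FIELD_GET(). A minimal userspace sketch of that pack/extract pattern; the stand-in macros skip the compile-time mask validation the real <linux/bitfield.h> versions perform, and the 0xe0 mask is illustrative only (the kernel uses FLOW_DIS_CFM_MDL_MASK):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for FIELD_PREP()/FIELD_GET(); a gcc/clang builtin
 * finds the mask's shift.
 */
#define FIELD_SHIFT(mask)	(__builtin_ctz(mask))
#define FIELD_PREP(mask, val)	(((unsigned)(val) << FIELD_SHIFT(mask)) & (mask))
#define FIELD_GET(mask, reg)	(((reg) & (mask)) >> FIELD_SHIFT(mask))

#define CFM_MDL_MASK	0xe0	/* illustrative: MD level in the top 3 bits */

int main(void)
{
	uint8_t mdl_ver = FIELD_PREP(CFM_MDL_MASK, 5);	/* set MD level 5 */

	printf("packed 0x%02x, level back out = %u\n",
	       (unsigned)mdl_ver, (unsigned)FIELD_GET(CFM_MDL_MASK, mdl_ver));
	return 0;
}
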
-static int fl_set_key(struct net *net, struct nlattr **tb,
- struct fl_flow_key *key, struct fl_flow_key *mask,
- struct netlink_ext_ack *extack)
+static int fl_set_key(struct net *net, struct nlattr *tca_opts,
+ struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask, struct netlink_ext_ack *extack)
{
__be16 ethertype;
int ret = 0;
-#ifdef CONFIG_NET_CLS_IND
+
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
if (err < 0)
return err;
- key->indev_ifindex = err;
- mask->indev_ifindex = 0xffffffff;
+ key->meta.ingress_ifindex = err;
+ mask->meta.ingress_ifindex = 0xffffffff;
}
-#endif
+
+ fl_set_key_val(tb, &key->meta.l2_miss, TCA_FLOWER_L2_MISS,
+ &mask->meta.l2_miss, TCA_FLOWER_UNSPEC,
+ sizeof(key->meta.l2_miss));
fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -894,38 +1892,36 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC,
mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
sizeof(key->eth.src));
-
- if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
- ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
-
- if (eth_type_vlan(ethertype)) {
- fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
- TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan,
- &mask->vlan);
-
- if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) {
- ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]);
- if (eth_type_vlan(ethertype)) {
- fl_set_key_vlan(tb, ethertype,
- TCA_FLOWER_KEY_CVLAN_ID,
- TCA_FLOWER_KEY_CVLAN_PRIO,
- &key->cvlan, &mask->cvlan);
- fl_set_key_val(tb, &key->basic.n_proto,
- TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
- &mask->basic.n_proto,
- TCA_FLOWER_UNSPEC,
- sizeof(key->basic.n_proto));
- } else {
- key->basic.n_proto = ethertype;
- mask->basic.n_proto = cpu_to_be16(~0);
- }
- }
- } else {
- key->basic.n_proto = ethertype;
- mask->basic.n_proto = cpu_to_be16(~0);
+ fl_set_key_val(tb, &key->num_of_vlans,
+ TCA_FLOWER_KEY_NUM_OF_VLANS,
+ &mask->num_of_vlans,
+ TCA_FLOWER_UNSPEC,
+ sizeof(key->num_of_vlans));
+
+ if (is_vlan_key(tb[TCA_FLOWER_KEY_ETH_TYPE], &ethertype, key, mask, 0)) {
+ fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
+ TCA_FLOWER_KEY_VLAN_PRIO,
+ TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ &key->vlan, &mask->vlan);
+
+ if (is_vlan_key(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE],
+ &ethertype, key, mask, 1)) {
+ fl_set_key_vlan(tb, ethertype,
+ TCA_FLOWER_KEY_CVLAN_ID,
+ TCA_FLOWER_KEY_CVLAN_PRIO,
+ TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ &key->cvlan, &mask->cvlan);
+ fl_set_key_val(tb, &key->basic.n_proto,
+ TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ &mask->basic.n_proto,
+ TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto));
}
}
+ if (key->basic.n_proto == htons(ETH_P_PPP_SES))
+ fl_set_key_pppoe(tb, &key->pppoe, &mask->pppoe, key, mask);
+
if (key->basic.n_proto == htons(ETH_P_IP) ||
key->basic.n_proto == htons(ETH_P_IPV6)) {
fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
@@ -1000,7 +1996,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
sizeof(key->icmp.code));
} else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) ||
key->basic.n_proto == htons(ETH_P_MPLS_MC)) {
- ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls);
+ ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls, extack);
if (ret)
return ret;
} else if (key->basic.n_proto == htons(ETH_P_ARP) ||
@@ -1020,12 +2016,27 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA,
mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
sizeof(key->arp.tha));
+ } else if (key->basic.ip_proto == IPPROTO_L2TP) {
+ fl_set_key_val(tb, &key->l2tpv3.session_id,
+ TCA_FLOWER_KEY_L2TPV3_SID,
+ &mask->l2tpv3.session_id, TCA_FLOWER_UNSPEC,
+ sizeof(key->l2tpv3.session_id));
+ } else if (key->basic.n_proto == htons(ETH_P_CFM)) {
+ ret = fl_set_key_cfm(tb, key, mask, extack);
+ if (ret)
+ return ret;
}
if (key->basic.ip_proto == IPPROTO_TCP ||
key->basic.ip_proto == IPPROTO_UDP ||
key->basic.ip_proto == IPPROTO_SCTP) {
- ret = fl_set_key_port_range(tb, key, mask);
+ ret = fl_set_key_port_range(tb, key, mask, extack);
+ if (ret)
+ return ret;
+ }
+
+ if (tb[TCA_FLOWER_KEY_SPI]) {
+ ret = fl_set_key_spi(tb, key, mask, extack);
if (ret)
return ret;
}
@@ -1076,14 +2087,32 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
+ fl_set_key_val(tb, &key->hash.hash, TCA_FLOWER_KEY_HASH,
+ &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK,
+ sizeof(key->hash.hash));
+
if (tb[TCA_FLOWER_KEY_ENC_OPTS]) {
ret = fl_set_enc_opt(tb, key, mask, extack);
if (ret)
return ret;
}
- if (tb[TCA_FLOWER_KEY_FLAGS])
- ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+ ret = fl_set_key_ct(tb, &key->ct, &mask->ct, extack);
+ if (ret)
+ return ret;
+
+ if (tb[TCA_FLOWER_KEY_FLAGS]) {
+ ret = fl_set_key_flags(tca_opts, tb, false,
+ &key->control.flags,
+ &mask->control.flags, extack);
+ if (ret)
+ return ret;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_FLAGS])
+ ret = fl_set_key_flags(tca_opts, tb, true,
+ &key->enc_control.flags,
+ &mask->enc_control.flags, extack);
return ret;
}
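
Everything fl_set_key() fills in is only meaningful together with its mask: fl_set_masked_key() (called from fl_change() below) ANDs the key with the mask so that both stored filters and packet keys are compared pre-masked inside the per-mask hash table. A minimal sketch of that masked-match idea with a toy two-field key (toy types, not the kernel structures):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct toy_key {
	uint32_t dst_ip;
	uint32_t dst_port;	/* kept 32-bit so the struct has no padding */
};

/* mkey = key & mask, byte by byte -- the same idea as fl_set_masked_key(). */
static void set_masked_key(struct toy_key *mkey, const struct toy_key *key,
			   const struct toy_key *mask)
{
	const uint8_t *k = (const uint8_t *)key, *m = (const uint8_t *)mask;
	uint8_t *o = (uint8_t *)mkey;

	for (size_t i = 0; i < sizeof(*key); i++)
		o[i] = k[i] & m[i];
}

int main(void)
{
	struct toy_key mask = { .dst_ip = 0xffffff00, .dst_port = 0 };	 /* /24, any port */
	struct toy_key rule = { .dst_ip = 0x0a000001, .dst_port = 80 };	 /* 10.0.0.1:80 */
	struct toy_key pkt  = { .dst_ip = 0x0a0000c7, .dst_port = 443 }; /* 10.0.0.199:443 */
	struct toy_key mrule, mpkt;

	set_masked_key(&mrule, &rule, &mask);
	set_masked_key(&mpkt, &pkt, &mask);
	printf("match: %s\n", memcmp(&mrule, &mpkt, sizeof(mrule)) ? "no" : "yes");
	return 0;
}
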
@@ -1114,7 +2143,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
-#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member)
+#define FL_KEY_MEMBER_SIZE(member) sizeof_field(struct fl_flow_key, member)
#define FL_KEY_IS_MASKED(mask, member) \
memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
@@ -1139,6 +2168,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_META, meta);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1147,9 +2178,10 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- if (FL_KEY_IS_MASKED(mask, tp) ||
- FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max))
- FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE, tp_range);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IP, ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1171,7 +2203,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
if (FL_KEY_IS_MASKED(mask, enc_ipv4) ||
- FL_KEY_IS_MASKED(mask, enc_ipv6))
+ FL_KEY_IS_MASKED(mask, enc_ipv6) ||
+ FL_KEY_IS_MASKED(mask, enc_control))
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
enc_control);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1180,6 +2213,20 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_CT, ct);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_HASH, hash);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS, num_of_vlans);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PPPOE, pppoe);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPSEC, ipsec);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_CFM, cfm);
skb_flow_dissector_init(dissector, keys, cnt);
}
@@ -1196,8 +2243,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
fl_mask_copy(newmask, mask);
- if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
- (newmask->key.tp_min.src && newmask->key.tp_max.src))
+ if ((newmask->key.tp_range.tp_min.dst &&
+ newmask->key.tp_range.tp_max.dst) ||
+ (newmask->key.tp_range.tp_min.src &&
+ newmask->key.tp_range.tp_max.src))
newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
err = fl_init_mask_hashtable(newmask);
@@ -1208,12 +2257,15 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
INIT_LIST_HEAD_RCU(&newmask->filters);
- err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
- mask_ht_params);
+ refcount_set(&newmask->refcnt, 1);
+ err = rhashtable_replace_fast(&head->ht, &mask->ht_node,
+ &newmask->ht_node, mask_ht_params);
if (err)
goto errout_destroy;
+ spin_lock(&head->masks_lock);
list_add_tail_rcu(&newmask->list, &head->masks);
+ spin_unlock(&head->masks_lock);
return newmask;
@@ -1231,75 +2283,104 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
struct fl_flow_mask *mask)
{
struct fl_flow_mask *newmask;
+ int ret = 0;
- fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
+ rcu_read_lock();
+
+ /* Insert mask as temporary node to prevent concurrent creation of mask
+ * with same key. Any concurrent lookups with same key will return
+ * -EAGAIN because mask's refcnt is zero.
+ */
+ fnew->mask = rhashtable_lookup_get_insert_fast(&head->ht,
+ &mask->ht_node,
+ mask_ht_params);
if (!fnew->mask) {
- if (fold)
- return -EINVAL;
+ rcu_read_unlock();
+
+ if (fold) {
+ ret = -EINVAL;
+ goto errout_cleanup;
+ }
newmask = fl_create_new_mask(head, mask);
- if (IS_ERR(newmask))
- return PTR_ERR(newmask);
+ if (IS_ERR(newmask)) {
+ ret = PTR_ERR(newmask);
+ goto errout_cleanup;
+ }
fnew->mask = newmask;
+ return 0;
+ } else if (IS_ERR(fnew->mask)) {
+ ret = PTR_ERR(fnew->mask);
} else if (fold && fold->mask != fnew->mask) {
- return -EINVAL;
+ ret = -EINVAL;
+ } else if (!refcount_inc_not_zero(&fnew->mask->refcnt)) {
+ /* Mask was deleted concurrently, try again */
+ ret = -EAGAIN;
}
+ rcu_read_unlock();
+ return ret;
- return 0;
+errout_cleanup:
+ rhashtable_remove_fast(&head->ht, &mask->ht_node,
+ mask_ht_params);
+ return ret;
}
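
fl_check_assign_mask() may find an existing mask under rcu_read_lock(), but RCU only guarantees the memory is still readable, not that the object is still live; refcount_inc_not_zero() takes a reference only if no concurrent fl_mask_put() has already dropped the count to zero, and the -EAGAIN retry covers the losing case. A hedged userspace model of that "conditional get" using C11 atomics (a sketch of the idea, not the kernel's refcount_t implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Increment only while the counter is still non-zero; once a put has taken
 * it to zero the object is committed to being freed and must not be revived.
 */
static bool toy_refcount_inc_not_zero(atomic_uint *ref)
{
	unsigned int old = atomic_load(ref);

	do {
		if (old == 0)
			return false;	/* a concurrent put already won */
	} while (!atomic_compare_exchange_weak(ref, &old, old + 1));

	return true;
}

int main(void)
{
	atomic_uint live = 2, dying = 0;

	printf("live object:  got ref? %d (count now %u)\n",
	       toy_refcount_inc_not_zero(&live), (unsigned)live);
	printf("dying object: got ref? %d\n", toy_refcount_inc_not_zero(&dying));
	return 0;
}
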
-static int fl_set_parms(struct net *net, struct tcf_proto *tp,
- struct cls_fl_filter *f, struct fl_flow_mask *mask,
- unsigned long base, struct nlattr **tb,
- struct nlattr *est, bool ovr,
- struct fl_flow_tmplt *tmplt,
- struct netlink_ext_ack *extack)
+static bool fl_needs_tc_skb_ext(const struct fl_flow_key *mask)
{
- int err;
-
- err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
- if (err < 0)
- return err;
-
- if (tb[TCA_FLOWER_CLASSID]) {
- f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
- tcf_bind_filter(tp, &f->res, base);
- }
-
- err = fl_set_key(net, tb, &f->key, &mask->key, extack);
- if (err)
- return err;
+ return mask->meta.l2_miss;
+}
- fl_mask_update_range(mask);
- fl_set_masked_key(&f->mkey, &f->key, mask);
+static int fl_ht_insert_unique(struct cls_fl_filter *fnew,
+ struct cls_fl_filter *fold,
+ bool *in_ht)
+{
+ struct fl_flow_mask *mask = fnew->mask;
+ int err;
- if (!fl_mask_fits_tmplt(tmplt, mask)) {
- NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template");
- return -EINVAL;
+ err = rhashtable_lookup_insert_fast(&mask->ht,
+ &fnew->ht_node,
+ mask->filter_ht_params);
+ if (err) {
+ *in_ht = false;
+ /* It is okay if filter with same key exists when
+ * overwriting.
+ */
+ return fold && err == -EEXIST ? 0 : err;
}
+ *in_ht = true;
return 0;
}
static int fl_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, u32 flags,
+ struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
+ bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL);
+ struct nlattr *tca_opts = tca[TCA_OPTIONS];
struct cls_fl_filter *fold = *arg;
+ bool bound_to_filter = false;
struct cls_fl_filter *fnew;
struct fl_flow_mask *mask;
struct nlattr **tb;
+ bool in_ht;
int err;
- if (!tca[TCA_OPTIONS])
- return -EINVAL;
+ if (!tca_opts) {
+ err = -EINVAL;
+ goto errout_fold;
+ }
mask = kzalloc(sizeof(struct fl_flow_mask), GFP_KERNEL);
- if (!mask)
- return -ENOBUFS;
+ if (!mask) {
+ err = -ENOBUFS;
+ goto errout_fold;
+ }
tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
if (!tb) {
@@ -1307,8 +2388,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout_mask_alloc;
}
- err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
- fl_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX,
+ tca_opts, fl_policy, NULL);
if (err < 0)
goto errout_tb;
@@ -1322,223 +2403,423 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
err = -ENOBUFS;
goto errout_tb;
}
-
- err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
- if (err < 0)
- goto errout;
-
- if (!handle) {
- handle = 1;
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- INT_MAX, GFP_KERNEL);
- } else if (!fold) {
- /* user specifies a handle and it doesn't exist */
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- handle, GFP_KERNEL);
- }
- if (err)
- goto errout;
- fnew->handle = handle;
+ INIT_LIST_HEAD(&fnew->hw_list);
+ refcount_set(&fnew->refcnt, 1);
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
if (!tc_flags_valid(fnew->flags)) {
+ kfree(fnew);
err = -EINVAL;
- goto errout_idr;
+ goto errout_tb;
}
}
- err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr,
- tp->chain->tmplt_priv, extack);
- if (err)
+ if (!fold) {
+ spin_lock(&tp->lock);
+ if (!handle) {
+ handle = 1;
+ err = idr_alloc_u32(&head->handle_idr, NULL, &handle,
+ INT_MAX, GFP_ATOMIC);
+ } else {
+ err = idr_alloc_u32(&head->handle_idr, NULL, &handle,
+ handle, GFP_ATOMIC);
+
+ /* Filter with specified handle was concurrently
+ * inserted after initial check in cls_api. This is not
+ * necessarily an error if NLM_F_EXCL is not set in
+ * message flags. Returning EAGAIN will cause cls_api to
+ * try to update concurrently inserted rule.
+ */
+ if (err == -ENOSPC)
+ err = -EAGAIN;
+ }
+ spin_unlock(&tp->lock);
+
+ if (err) {
+ kfree(fnew);
+ goto errout_tb;
+ }
+ }
+ fnew->handle = handle;
+
+ err = tcf_exts_init_ex(&fnew->exts, net, TCA_FLOWER_ACT, 0, tp, handle,
+ !tc_skip_hw(fnew->flags));
+ if (err < 0)
goto errout_idr;
- err = fl_check_assign_mask(head, fnew, fold, mask);
- if (err)
+ err = tcf_exts_validate_ex(net, tp, tb, tca[TCA_RATE],
+ &fnew->exts, flags, fnew->flags,
+ extack);
+ if (err < 0)
goto errout_idr;
- if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) {
- err = -EEXIST;
- goto errout_mask;
+ if (tb[TCA_FLOWER_CLASSID]) {
+ fnew->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
+ if (flags & TCA_ACT_FLAGS_NO_RTNL)
+ rtnl_lock();
+ tcf_bind_filter(tp, &fnew->res, base);
+ if (flags & TCA_ACT_FLAGS_NO_RTNL)
+ rtnl_unlock();
+ bound_to_filter = true;
+ }
+
+ err = fl_set_key(net, tca_opts, tb, &fnew->key, &mask->key, extack);
+ if (err)
+ goto unbind_filter;
+
+ fl_mask_update_range(mask);
+ fl_set_masked_key(&fnew->mkey, &fnew->key, mask);
+
+ if (!fl_mask_fits_tmplt(tp->chain->tmplt_priv, mask)) {
+ NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template");
+ err = -EINVAL;
+ goto unbind_filter;
+ }
+
+ /* Enable tc skb extension if filter matches on data extracted from
+ * this extension.
+ */
+ if (fl_needs_tc_skb_ext(&mask->key)) {
+ fnew->needs_tc_skb_ext = 1;
+ tc_skb_ext_tc_enable();
}
- err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
- fnew->mask->filter_ht_params);
+ err = fl_check_assign_mask(head, fnew, fold, mask);
+ if (err)
+ goto unbind_filter;
+
+ err = fl_ht_insert_unique(fnew, fold, &in_ht);
if (err)
goto errout_mask;
if (!tc_skip_hw(fnew->flags)) {
- err = fl_hw_replace_filter(tp, fnew, extack);
+ err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
if (err)
- goto errout_mask;
+ goto errout_ht;
}
if (!tc_in_hw(fnew->flags))
fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ tcf_proto_update_usesw(tp, fnew->flags);
+
+ spin_lock(&tp->lock);
+
+ /* tp was deleted concurrently. -EAGAIN will cause caller to lookup
+ * proto again or create new one, if necessary.
+ */
+ if (tp->deleting) {
+ err = -EAGAIN;
+ goto errout_hw;
+ }
+
if (fold) {
+ /* Fold filter was deleted concurrently. Retry lookup. */
+ if (fold->deleted) {
+ err = -EAGAIN;
+ goto errout_hw;
+ }
+
+ fnew->handle = handle;
+
+ if (!in_ht) {
+ struct rhashtable_params params =
+ fnew->mask->filter_ht_params;
+
+ err = rhashtable_insert_fast(&fnew->mask->ht,
+ &fnew->ht_node,
+ params);
+ if (err)
+ goto errout_hw;
+ in_ht = true;
+ }
+
+ refcount_inc(&fnew->refcnt);
rhashtable_remove_fast(&fold->mask->ht,
&fold->ht_node,
fold->mask->filter_ht_params);
- if (!tc_skip_hw(fold->flags))
- fl_hw_destroy_filter(tp, fold, NULL);
- }
-
- *arg = fnew;
-
- if (fold) {
idr_replace(&head->handle_idr, fnew, fnew->handle);
list_replace_rcu(&fold->list, &fnew->list);
+ fold->deleted = true;
+
+ spin_unlock(&tp->lock);
+
+ fl_mask_put(head, fold->mask);
+ if (!tc_skip_hw(fold->flags))
+ fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
tcf_unbind_filter(tp, &fold->res);
- tcf_exts_get_net(&fold->exts);
- tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
+ /* Caller holds reference to fold, so refcnt is always > 0
+ * after this.
+ */
+ refcount_dec(&fold->refcnt);
+ __fl_put(fold);
} else {
+ idr_replace(&head->handle_idr, fnew, fnew->handle);
+
+ refcount_inc(&fnew->refcnt);
list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
+ spin_unlock(&tp->lock);
}
+ *arg = fnew;
+
kfree(tb);
- kfree(mask);
+ tcf_queue_work(&mask->rwork, fl_uninit_mask_free_work);
return 0;
+errout_ht:
+ spin_lock(&tp->lock);
+errout_hw:
+ fnew->deleted = true;
+ spin_unlock(&tp->lock);
+ if (!tc_skip_hw(fnew->flags))
+ fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
+ if (in_ht)
+ rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
errout_mask:
- fl_mask_put(head, fnew->mask, false);
+ fl_mask_put(head, fnew->mask);
+
+unbind_filter:
+ if (bound_to_filter) {
+ if (flags & TCA_ACT_FLAGS_NO_RTNL)
+ rtnl_lock();
+ tcf_unbind_filter(tp, &fnew->res);
+ if (flags & TCA_ACT_FLAGS_NO_RTNL)
+ rtnl_unlock();
+ }
errout_idr:
- if (!fold)
+ if (!fold) {
+ spin_lock(&tp->lock);
idr_remove(&head->handle_idr, fnew->handle);
-errout:
- tcf_exts_destroy(&fnew->exts);
- kfree(fnew);
+ spin_unlock(&tp->lock);
+ }
+ __fl_put(fnew);
errout_tb:
kfree(tb);
errout_mask_alloc:
- kfree(mask);
+ tcf_queue_work(&mask->rwork, fl_uninit_mask_free_work);
+errout_fold:
+ if (fold)
+ __fl_put(fold);
return err;
}
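
One detail in fl_change() worth calling out: a new handle is first reserved in the IDR with a NULL pointer under tp->lock, the filter is then built without holding the lock, and only idr_replace() publishes the finished filter, so concurrent get/dump paths see either nothing or a fully constructed filter. A toy reserve-then-publish sketch (the kernel IDR can distinguish a reserved-but-NULL id from a free one, which this fixed-size table cannot, hence the sentinel):

#include <pthread.h>
#include <stdio.h>
#include <stddef.h>

#define SLOTS 8

static char slot_reserved;	/* sentinel: allocated, object not yet published */
static void *table[SLOTS];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Grab a free slot and mark it reserved; the caller builds its object later,
 * outside the lock.
 */
static int reserve_slot(void)
{
	pthread_mutex_lock(&table_lock);
	for (int i = 0; i < SLOTS; i++) {
		if (!table[i]) {
			table[i] = &slot_reserved;
			pthread_mutex_unlock(&table_lock);
			return i;
		}
	}
	pthread_mutex_unlock(&table_lock);
	return -1;
}

/* Publish the finished object; lookups now return it (idr_replace() above
 * plays this role).
 */
static void publish_slot(int slot, void *obj)
{
	pthread_mutex_lock(&table_lock);
	table[slot] = obj;
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	int value = 42;
	int slot = reserve_slot();

	if (slot < 0)
		return 1;
	/* ... long object construction would happen here, lock not held ... */
	publish_slot(slot, &value);
	printf("slot %d -> %p\n", slot, table[slot]);
	return 0;
}
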
static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *f = arg;
+ bool last_on_mask;
+ int err = 0;
- rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
- f->mask->filter_ht_params);
- __fl_delete(tp, f, extack);
+ err = __fl_delete(tp, f, &last_on_mask, rtnl_held, extack);
*last = list_empty(&head->masks);
- return 0;
+ __fl_put(f);
+
+ return err;
}
-static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
+ unsigned long id = arg->cookie, tmp;
struct cls_fl_filter *f;
arg->count = arg->skip;
- while ((f = idr_get_next_ul(&head->handle_idr,
- &arg->cookie)) != NULL) {
+ rcu_read_lock();
+ idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) {
+ /* don't return filters that are being deleted */
+ if (!f || !refcount_inc_not_zero(&f->refcnt))
+ continue;
+ rcu_read_unlock();
+
if (arg->fn(tp, f, arg) < 0) {
+ __fl_put(f);
arg->stop = 1;
+ rcu_read_lock();
break;
}
- arg->cookie = f->handle + 1;
+ __fl_put(f);
arg->count++;
+ rcu_read_lock();
}
+ rcu_read_unlock();
+ arg->cookie = id;
}
-static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+static struct cls_fl_filter *
+fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ if (list_empty(&head->hw_filters)) {
+ spin_unlock(&tp->lock);
+ return NULL;
+ }
+
+ if (!f)
+ f = list_entry(&head->hw_filters, struct cls_fl_filter,
+ hw_list);
+ list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
+ if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
+ spin_unlock(&tp->lock);
+ return f;
+ }
+ }
+
+ spin_unlock(&tp->lock);
+ return NULL;
+}
+
+static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
- struct fl_flow_mask *mask;
- struct cls_fl_filter *f;
+ struct flow_cls_offload cls_flower = {};
+ struct cls_fl_filter *f = NULL;
int err;
- list_for_each_entry(mask, &head->masks, list) {
- list_for_each_entry(f, &mask->filters, list) {
- if (tc_skip_hw(f->flags))
- continue;
-
- tc_cls_common_offload_init(&cls_flower.common, tp,
- f->flags, extack);
- cls_flower.command = add ?
- TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
- cls_flower.cookie = (unsigned long)f;
- cls_flower.dissector = &mask->dissector;
- cls_flower.mask = &mask->key;
- cls_flower.key = &f->mkey;
- cls_flower.exts = &f->exts;
- cls_flower.classid = f->res.classid;
-
- err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
- if (err) {
- if (add && tc_skip_sw(f->flags))
- return err;
- continue;
+ /* hw_filters list can only be changed by hw offload functions after
+ * obtaining rtnl lock. Make sure it is not changed while reoffload is
+ * iterating it.
+ */
+ ASSERT_RTNL();
+
+ while ((f = fl_get_next_hw_filter(tp, f, add))) {
+ cls_flower.rule =
+ flow_rule_alloc(tcf_exts_num_actions(&f->exts));
+ if (!cls_flower.rule) {
+ __fl_put(f);
+ return -ENOMEM;
+ }
+
+ tc_cls_common_offload_init(&cls_flower.common, tp, f->flags,
+ extack);
+ cls_flower.command = add ?
+ FLOW_CLS_REPLACE : FLOW_CLS_DESTROY;
+ cls_flower.cookie = (unsigned long)f;
+ cls_flower.rule->match.dissector = &f->mask->dissector;
+ cls_flower.rule->match.mask = &f->mask->key;
+ cls_flower.rule->match.key = &f->mkey;
+
+ err = tc_setup_offload_action(&cls_flower.rule->action, &f->exts,
+ cls_flower.common.extack);
+ if (err) {
+ kfree(cls_flower.rule);
+ if (tc_skip_sw(f->flags)) {
+ __fl_put(f);
+ return err;
}
+ goto next_flow;
+ }
+
+ cls_flower.classid = f->res.classid;
- tc_cls_offload_cnt_update(block, &f->in_hw_count,
- &f->flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb,
+ TC_SETUP_CLSFLOWER, &cls_flower,
+ cb_priv, &f->flags,
+ &f->in_hw_count);
+ tc_cleanup_offload_action(&cls_flower.rule->action);
+ kfree(cls_flower.rule);
+
+ if (err) {
+ __fl_put(f);
+ return err;
}
+next_flow:
+ __fl_put(f);
}
return 0;
}
-static void fl_hw_create_tmplt(struct tcf_chain *chain,
- struct fl_flow_tmplt *tmplt)
+static void fl_hw_add(struct tcf_proto *tp, void *type_data)
+{
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ list_add(&f->hw_list, &head->hw_filters);
+ spin_unlock(&tp->lock);
+}
+
+static void fl_hw_del(struct tcf_proto *tp, void *type_data)
+{
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+
+ spin_lock(&tp->lock);
+ if (!list_empty(&f->hw_list))
+ list_del_init(&f->hw_list);
+ spin_unlock(&tp->lock);
+}
+
+static int fl_hw_create_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
- struct tcf_exts dummy_exts = { 0, };
+
+ cls_flower.rule = flow_rule_alloc(0);
+ if (!cls_flower.rule)
+ return -ENOMEM;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+ cls_flower.command = FLOW_CLS_TMPLT_CREATE;
cls_flower.cookie = (unsigned long) tmplt;
- cls_flower.dissector = &tmplt->dissector;
- cls_flower.mask = &tmplt->mask;
- cls_flower.key = &tmplt->dummy_key;
- cls_flower.exts = &dummy_exts;
+ cls_flower.rule->match.dissector = &tmplt->dissector;
+ cls_flower.rule->match.mask = &tmplt->mask;
+ cls_flower.rule->match.key = &tmplt->dummy_key;
/* We don't care if driver (any of them) fails to handle this
* call. It serves just as a hint for it.
*/
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
+ kfree(cls_flower.rule);
+
+ return 0;
}
static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+ cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
cls_flower.cookie = (unsigned long) tmplt;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
}
static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
struct nlattr **tca,
struct netlink_ext_ack *extack)
{
+ struct nlattr *tca_opts = tca[TCA_OPTIONS];
struct fl_flow_tmplt *tmplt;
struct nlattr **tb;
int err;
- if (!tca[TCA_OPTIONS])
+ if (!tca_opts)
return ERR_PTR(-EINVAL);
tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
if (!tb)
return ERR_PTR(-ENOBUFS);
- err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
- fl_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX,
+ tca_opts, fl_policy, NULL);
if (err)
goto errout_tb;
@@ -1548,15 +2829,18 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
goto errout_tb;
}
tmplt->chain = chain;
- err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack);
+ err = fl_set_key(net, tca_opts, tb, &tmplt->dummy_key,
+ &tmplt->mask, extack);
if (err)
goto errout_tmplt;
- kfree(tb);
fl_init_dissector(&tmplt->dissector, &tmplt->mask);
- fl_hw_create_tmplt(chain, tmplt);
+ err = fl_hw_create_tmplt(chain, tmplt);
+ if (err)
+ goto errout_tmplt;
+ kfree(tb);
return tmplt;
errout_tmplt:
@@ -1574,6 +2858,28 @@ static void fl_tmplt_destroy(void *tmplt_priv)
kfree(tmplt);
}
+static void fl_tmplt_reoffload(struct tcf_chain *chain, bool add,
+ flow_setup_cb_t *cb, void *cb_priv)
+{
+ struct fl_flow_tmplt *tmplt = chain->tmplt_priv;
+ struct flow_cls_offload cls_flower = {};
+
+ cls_flower.rule = flow_rule_alloc(0);
+ if (!cls_flower.rule)
+ return;
+
+ cls_flower.common.chain_index = chain->index;
+ cls_flower.command = add ? FLOW_CLS_TMPLT_CREATE :
+ FLOW_CLS_TMPLT_DESTROY;
+ cls_flower.cookie = (unsigned long) tmplt;
+ cls_flower.rule->match.dissector = &tmplt->dissector;
+ cls_flower.rule->match.mask = &tmplt->mask;
+ cls_flower.rule->match.key = &tmplt->dummy_key;
+
+ cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+ kfree(cls_flower.rule);
+}
+
static int fl_dump_key_val(struct sk_buff *skb,
void *val, int val_type,
void *mask, int mask_type, int len)
@@ -1596,52 +2902,153 @@ static int fl_dump_key_val(struct sk_buff *skb,
static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key,
struct fl_flow_key *mask)
{
- if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN,
- &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_min.dst)) ||
- fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX,
- &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_max.dst)) ||
- fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN,
- &mask->tp_min.src, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_min.src)) ||
- fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX,
- &mask->tp_max.src, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_max.src)))
+ if (fl_dump_key_val(skb, &key->tp_range.tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN,
+ &mask->tp_range.tp_min.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_min.dst)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX,
+ &mask->tp_range.tp_max.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_max.dst)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN,
+ &mask->tp_range.tp_min.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_min.src)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX,
+ &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_max.src)))
return -1;
return 0;
}
+static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb,
+ struct flow_dissector_key_mpls *mpls_key,
+ struct flow_dissector_key_mpls *mpls_mask,
+ u8 lse_index)
+{
+ struct flow_dissector_mpls_lse *lse_mask = &mpls_mask->ls[lse_index];
+ struct flow_dissector_mpls_lse *lse_key = &mpls_key->ls[lse_index];
+ int err;
+
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH,
+ lse_index + 1);
+ if (err)
+ return err;
+
+ if (lse_mask->mpls_ttl) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL,
+ lse_key->mpls_ttl);
+ if (err)
+ return err;
+ }
+ if (lse_mask->mpls_bos) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS,
+ lse_key->mpls_bos);
+ if (err)
+ return err;
+ }
+ if (lse_mask->mpls_tc) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TC,
+ lse_key->mpls_tc);
+ if (err)
+ return err;
+ }
+ if (lse_mask->mpls_label) {
+ err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL,
+ lse_key->mpls_label);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int fl_dump_key_mpls_opts(struct sk_buff *skb,
+ struct flow_dissector_key_mpls *mpls_key,
+ struct flow_dissector_key_mpls *mpls_mask)
+{
+ struct nlattr *opts;
+ struct nlattr *lse;
+ u8 lse_index;
+ int err;
+
+ opts = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS);
+ if (!opts)
+ return -EMSGSIZE;
+
+ for (lse_index = 0; lse_index < FLOW_DIS_MPLS_MAX; lse_index++) {
+ if (!(mpls_mask->used_lses & 1 << lse_index))
+ continue;
+
+ lse = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS_LSE);
+ if (!lse) {
+ err = -EMSGSIZE;
+ goto err_opts;
+ }
+
+ err = fl_dump_key_mpls_opt_lse(skb, mpls_key, mpls_mask,
+ lse_index);
+ if (err)
+ goto err_opts_lse;
+ nla_nest_end(skb, lse);
+ }
+ nla_nest_end(skb, opts);
+
+ return 0;
+
+err_opts_lse:
+ nla_nest_cancel(skb, lse);
+err_opts:
+ nla_nest_cancel(skb, opts);
+
+ return err;
+}
+
static int fl_dump_key_mpls(struct sk_buff *skb,
struct flow_dissector_key_mpls *mpls_key,
struct flow_dissector_key_mpls *mpls_mask)
{
+ struct flow_dissector_mpls_lse *lse_mask;
+ struct flow_dissector_mpls_lse *lse_key;
int err;
- if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask)))
+ if (!mpls_mask->used_lses)
return 0;
- if (mpls_mask->mpls_ttl) {
+
+ lse_mask = &mpls_mask->ls[0];
+ lse_key = &mpls_key->ls[0];
+
+ /* For backward compatibility, don't use the MPLS nested attributes if
+ * the rule can be expressed using the old attributes.
+ */
+ if (mpls_mask->used_lses & ~1 ||
+ (!lse_mask->mpls_ttl && !lse_mask->mpls_bos &&
+ !lse_mask->mpls_tc && !lse_mask->mpls_label))
+ return fl_dump_key_mpls_opts(skb, mpls_key, mpls_mask);
+
+ if (lse_mask->mpls_ttl) {
err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL,
- mpls_key->mpls_ttl);
+ lse_key->mpls_ttl);
if (err)
return err;
}
- if (mpls_mask->mpls_tc) {
+ if (lse_mask->mpls_tc) {
err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC,
- mpls_key->mpls_tc);
+ lse_key->mpls_tc);
if (err)
return err;
}
- if (mpls_mask->mpls_label) {
+ if (lse_mask->mpls_label) {
err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL,
- mpls_key->mpls_label);
+ lse_key->mpls_label);
if (err)
return err;
}
- if (mpls_mask->mpls_bos) {
+ if (lse_mask->mpls_bos) {
err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS,
- mpls_key->mpls_bos);
+ lse_key->mpls_bos);
if (err)
return err;
}
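
fl_dump_key_mpls() keeps emitting the old flat TCA_FLOWER_KEY_MPLS_* attributes whenever the rule can still be expressed that way, and only falls back to the nested LSE list added above when it cannot. A small sketch of that decision, mirroring the condition in the function (toy types; the real check reads mpls_mask->used_lses and ls[0]):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_lse_mask {
	uint8_t mpls_ttl, mpls_bos, mpls_tc;
	uint32_t mpls_label;
};

/* Use the nested LSE attributes unless only LSE 0 is used and at least one
 * legacy per-field attribute is actually masked.
 */
static bool must_use_nested_lse(uint8_t used_lses, const struct toy_lse_mask *lse0)
{
	if (used_lses & ~1u)
		return true;	/* depth > 1 has no legacy encoding */
	return !lse0->mpls_ttl && !lse0->mpls_bos &&
	       !lse0->mpls_tc && !lse0->mpls_label;
}

int main(void)
{
	struct toy_lse_mask only_ttl = { .mpls_ttl = 0xff };

	printf("lse0, ttl masked -> nested? %d\n", must_use_nested_lse(0x1, &only_ttl));
	printf("lse0 + lse1 used -> nested? %d\n", must_use_nested_lse(0x3, &only_ttl));
	return 0;
}
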
@@ -1699,12 +3106,22 @@ static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask,
}
}
-static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
+static int fl_dump_key_flags(struct sk_buff *skb, bool encap,
+ u32 flags_key, u32 flags_mask)
{
- u32 key, mask;
+ int fl_key, fl_mask;
__be32 _key, _mask;
+ u32 key, mask;
int err;
+ if (encap) {
+ fl_key = TCA_FLOWER_KEY_ENC_FLAGS;
+ fl_mask = TCA_FLOWER_KEY_ENC_FLAGS_MASK;
+ } else {
+ fl_key = TCA_FLOWER_KEY_FLAGS;
+ fl_mask = TCA_FLOWER_KEY_FLAGS_MASK;
+ }
+
if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask)))
return 0;
@@ -1717,14 +3134,29 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
FLOW_DIS_FIRST_FRAG);
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM,
+ FLOW_DIS_F_TUNNEL_CSUM);
+
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT,
+ FLOW_DIS_F_TUNNEL_DONT_FRAGMENT);
+
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM, FLOW_DIS_F_TUNNEL_OAM);
+
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT,
+ FLOW_DIS_F_TUNNEL_CRIT_OPT);
+
_key = cpu_to_be32(key);
_mask = cpu_to_be32(mask);
- err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key);
+ err = nla_put(skb, fl_key, 4, &_key);
if (err)
return err;
- return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
+ return nla_put(skb, fl_mask, 4, &_mask);
}
static int fl_dump_key_geneve_opt(struct sk_buff *skb,
@@ -1734,7 +3166,7 @@ static int fl_dump_key_geneve_opt(struct sk_buff *skb,
struct nlattr *nest;
int opt_off = 0;
- nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
if (!nest)
goto nla_put_failure;
@@ -1761,6 +3193,186 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fl_dump_key_vxlan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct vxlan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct vxlan_metadata *)&enc_opts->data[0];
+ if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_erspan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct erspan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct erspan_metadata *)&enc_opts->data[0];
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto nla_put_failure;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto nla_put_failure;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_gtp_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+
+{
+ struct gtp_pdu_session_info *session_info;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_GTP);
+ if (!nest)
+ goto nla_put_failure;
+
+ session_info = (struct gtp_pdu_session_info *)&enc_opts->data[0];
+
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE,
+ session_info->pdu_type))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GTP_QFI, session_info->qfi))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_pfcp_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct pfcp_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_PFCP);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct pfcp_metadata *)&enc_opts->data[0];
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE, md->type))
+ goto nla_put_failure;
+
+ if (nla_put_be64(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID,
+ md->seid, 0))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_ct(struct sk_buff *skb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask)
+{
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK) &&
+ fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_cfm(struct sk_buff *skb,
+ struct flow_dissector_key_cfm *key,
+ struct flow_dissector_key_cfm *mask)
+{
+ struct nlattr *opts;
+ int err;
+ u8 mdl;
+
+ if (!memchr_inv(mask, 0, sizeof(*mask)))
+ return 0;
+
+ opts = nla_nest_start(skb, TCA_FLOWER_KEY_CFM);
+ if (!opts)
+ return -EMSGSIZE;
+
+ if (FIELD_GET(FLOW_DIS_CFM_MDL_MASK, mask->mdl_ver)) {
+ mdl = FIELD_GET(FLOW_DIS_CFM_MDL_MASK, key->mdl_ver);
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_CFM_MD_LEVEL, mdl);
+ if (err)
+ goto err_cfm_opts;
+ }
+
+ if (mask->opcode) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_CFM_OPCODE, key->opcode);
+ if (err)
+ goto err_cfm_opts;
+ }
+
+ nla_nest_end(skb, opts);
+
+ return 0;
+
+err_cfm_opts:
+ nla_nest_cancel(skb, opts);
+ return err;
+}
+
static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
struct flow_dissector_key_enc_opts *enc_opts)
{
@@ -1770,16 +3382,36 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
if (!enc_opts->len)
return 0;
- nest = nla_nest_start(skb, enc_opt_type);
+ nest = nla_nest_start_noflag(skb, enc_opt_type);
if (!nest)
goto nla_put_failure;
switch (enc_opts->dst_opt_type) {
- case TUNNEL_GENEVE_OPT:
+ case IP_TUNNEL_GENEVE_OPT_BIT:
err = fl_dump_key_geneve_opt(skb, enc_opts);
if (err)
goto nla_put_failure;
break;
+ case IP_TUNNEL_VXLAN_OPT_BIT:
+ err = fl_dump_key_vxlan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ case IP_TUNNEL_ERSPAN_OPT_BIT:
+ err = fl_dump_key_erspan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ case IP_TUNNEL_GTP_OPT_BIT:
+ err = fl_dump_key_gtp_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ case IP_TUNNEL_PFCP_OPT_BIT:
+ err = fl_dump_key_pfcp_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
default:
goto nla_put_failure;
}
@@ -1807,14 +3439,19 @@ static int fl_dump_key_enc_opt(struct sk_buff *skb,
static int fl_dump_key(struct sk_buff *skb, struct net *net,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
- if (mask->indev_ifindex) {
+ if (mask->meta.ingress_ifindex) {
struct net_device *dev;
- dev = __dev_get_by_index(net, key->indev_ifindex);
+ dev = __dev_get_by_index(net, key->meta.ingress_ifindex);
if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name))
goto nla_put_failure;
}
+ if (fl_dump_key_val(skb, &key->meta.l2_miss,
+ TCA_FLOWER_L2_MISS, &mask->meta.l2_miss,
+ TCA_FLOWER_UNSPEC, sizeof(key->meta.l2_miss)))
+ goto nla_put_failure;
+
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
sizeof(key->eth.dst)) ||
@@ -1826,6 +3463,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
sizeof(key->basic.n_proto)))
goto nla_put_failure;
+ if (mask->num_of_vlans.num_of_vlans) {
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_NUM_OF_VLANS, key->num_of_vlans.num_of_vlans))
+ goto nla_put_failure;
+ }
+
if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls))
goto nla_put_failure;
@@ -1842,13 +3484,13 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
goto nla_put_failure;
if (mask->basic.n_proto) {
- if (mask->cvlan.vlan_tpid) {
+ if (mask->cvlan.vlan_eth_type) {
if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
key->basic.n_proto))
goto nla_put_failure;
- } else if (mask->vlan.vlan_tpid) {
+ } else if (mask->vlan.vlan_eth_type) {
if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
- key->basic.n_proto))
+ key->vlan.vlan_eth_type))
goto nla_put_failure;
}
}
@@ -1861,6 +3503,17 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
goto nla_put_failure;
+ if (mask->pppoe.session_id) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_PPPOE_SID,
+ key->pppoe.session_id))
+ goto nla_put_failure;
+ }
+ if (mask->basic.n_proto && mask->pppoe.ppp_proto) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_PPP_PROTO,
+ key->pppoe.ppp_proto))
+ goto nla_put_failure;
+ }
+
if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
(fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
&mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
@@ -1948,6 +3601,19 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK,
sizeof(key->arp.tha))))
goto nla_put_failure;
+ else if (key->basic.ip_proto == IPPROTO_L2TP &&
+ fl_dump_key_val(skb, &key->l2tpv3.session_id,
+ TCA_FLOWER_KEY_L2TPV3_SID,
+ &mask->l2tpv3.session_id,
+ TCA_FLOWER_UNSPEC,
+ sizeof(key->l2tpv3.session_id)))
+ goto nla_put_failure;
+
+ if (key->ipsec.spi &&
+ fl_dump_key_val(skb, &key->ipsec.spi, TCA_FLOWER_KEY_SPI,
+ &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK,
+ sizeof(key->ipsec.spi)))
+ goto nla_put_failure;
if ((key->basic.ip_proto == IPPROTO_TCP ||
key->basic.ip_proto == IPPROTO_UDP ||
@@ -1994,7 +3660,23 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
goto nla_put_failure;
- if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
+ if (fl_dump_key_ct(skb, &key->ct, &mask->ct))
+ goto nla_put_failure;
+
+ if (fl_dump_key_flags(skb, false, key->control.flags,
+ mask->control.flags))
+ goto nla_put_failure;
+
+ if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH,
+ &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK,
+ sizeof(key->hash.hash)))
+ goto nla_put_failure;
+
+ if (fl_dump_key_cfm(skb, &key->cfm, &mask->cfm))
+ goto nla_put_failure;
+
+ if (fl_dump_key_flags(skb, true, key->enc_control.flags,
+ mask->enc_control.flags))
goto nla_put_failure;
return 0;
@@ -2004,36 +3686,42 @@ nla_put_failure:
}
static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct cls_fl_filter *f = fh;
struct nlattr *nest;
struct fl_flow_key *key, *mask;
+ bool skip_hw;
if (!f)
return skb->len;
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
+ spin_lock(&tp->lock);
+
if (f->res.classid &&
nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
- goto nla_put_failure;
+ goto nla_put_failure_locked;
key = &f->key;
mask = &f->mask->key;
+ skip_hw = tc_skip_hw(f->flags);
if (fl_dump_key(skb, net, key, mask))
- goto nla_put_failure;
-
- if (!tc_skip_hw(f->flags))
- fl_hw_update_stats(tp, f);
+ goto nla_put_failure_locked;
if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
- goto nla_put_failure;
+ goto nla_put_failure_locked;
+
+ spin_unlock(&tp->lock);
+
+ if (!skip_hw)
+ fl_hw_update_stats(tp, f, rtnl_held);
if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
goto nla_put_failure;
@@ -2048,6 +3736,50 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
return skb->len;
+nla_put_failure_locked:
+ spin_unlock(&tp->lock);
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int fl_terse_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
+{
+ struct cls_fl_filter *f = fh;
+ struct nlattr *nest;
+ bool skip_hw;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ spin_lock(&tp->lock);
+
+ skip_hw = tc_skip_hw(f->flags);
+
+ if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
+ goto nla_put_failure_locked;
+
+ spin_unlock(&tp->lock);
+
+ if (!skip_hw)
+ fl_hw_update_stats(tp, f, rtnl_held);
+
+ if (tcf_exts_terse_dump(skb, &f->exts))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ return skb->len;
+
+nla_put_failure_locked:
+ spin_unlock(&tp->lock);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
@@ -2059,7 +3791,7 @@ static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv)
struct fl_flow_key *key, *mask;
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
@@ -2078,12 +3810,23 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void fl_bind_class(void *fh, u32 classid, unsigned long cl)
+static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct cls_fl_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ tc_cls_bind_class(classid, cl, q, &f->res, base);
+}
+
+static bool fl_delete_empty(struct tcf_proto *tp)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ tp->deleting = idr_is_empty(&head->handle_idr);
+ spin_unlock(&tp->lock);
+
+ return tp->deleting;
}
static struct tcf_proto_ops cls_fl_ops __read_mostly = {
@@ -2092,17 +3835,26 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.init = fl_init,
.destroy = fl_destroy,
.get = fl_get,
+ .put = fl_put,
.change = fl_change,
.delete = fl_delete,
+ .delete_empty = fl_delete_empty,
.walk = fl_walk,
.reoffload = fl_reoffload,
+ .hw_add = fl_hw_add,
+ .hw_del = fl_hw_del,
.dump = fl_dump,
+ .terse_dump = fl_terse_dump,
.bind_class = fl_bind_class,
.tmplt_create = fl_tmplt_create,
.tmplt_destroy = fl_tmplt_destroy,
+ .tmplt_reoffload = fl_tmplt_reoffload,
.tmplt_dump = fl_tmplt_dump,
+ .get_exts = fl_get_exts,
.owner = THIS_MODULE,
+ .flags = TCF_PROTO_OPS_DOIT_UNLOCKED,
};
+MODULE_ALIAS_NET_CLS("flower");
static int __init cls_fl_init(void)
{
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 29eeeaf3ea44..cdddc8695228 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -1,21 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
- *
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
- *
*/
#include <linux/module.h>
@@ -29,6 +21,7 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
+#include <net/tc_wrapper.h>
#define HTSIZE 256
@@ -42,9 +35,7 @@ struct fw_filter {
struct fw_filter __rcu *next;
u32 id;
struct tcf_result res;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
struct tcf_proto *tp;
struct rcu_work rwork;
@@ -57,8 +48,9 @@ static u32 fw_hash(u32 handle)
return handle % HTSIZE;
}
-static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int fw_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct fw_head *head = rcu_dereference_bh(tp->root);
struct fw_filter *f;
@@ -72,10 +64,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
f = rcu_dereference_bh(f->next)) {
if (f->id == id) {
*res = f->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, f->ifindex))
continue;
-#endif /* CONFIG_NET_CLS_IND */
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
@@ -139,7 +129,8 @@ static void fw_delete_filter_work(struct work_struct *work)
rtnl_unlock();
}
-static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void fw_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f;
@@ -163,7 +154,7 @@ static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
}
static int fw_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = arg;
@@ -209,24 +200,18 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
static int fw_set_parms(struct net *net, struct tcf_proto *tp,
struct fw_filter *f, struct nlattr **tb,
- struct nlattr **tca, unsigned long base, bool ovr,
+ struct nlattr **tca, unsigned long base, u32 flags,
struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
u32 mask;
int err;
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr,
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, flags,
extack);
if (err < 0)
return err;
- if (tb[TCA_FW_CLASSID]) {
- f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
- tcf_bind_filter(tp, &f->res, base);
- }
-
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
@@ -234,7 +219,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
return ret;
f->ifindex = ret;
}
-#endif /* CONFIG_NET_CLS_IND */
err = -EINVAL;
if (tb[TCA_FW_MASK]) {
@@ -244,13 +228,18 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
} else if (head->mask != 0xFFFFFFFF)
return err;
+ if (tb[TCA_FW_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
return 0;
}
static int fw_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca, void **arg,
- bool ovr, struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = *arg;
@@ -261,7 +250,8 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
if (!opt)
return handle ? -EINVAL : 0; /* Succeed if it is old method. */
- err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FW_MAX, opt, fw_policy,
+ NULL);
if (err < 0)
return err;
@@ -277,19 +267,17 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
return -ENOBUFS;
fnew->id = f->id;
- fnew->res = f->res;
-#ifdef CONFIG_NET_CLS_IND
fnew->ifindex = f->ifindex;
-#endif /* CONFIG_NET_CLS_IND */
fnew->tp = f->tp;
- err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT,
+ TCA_FW_POLICE);
if (err < 0) {
kfree(fnew);
return err;
}
- err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack);
+ err = fw_set_parms(net, tp, fnew, tb, tca, base, flags, extack);
if (err < 0) {
tcf_exts_destroy(&fnew->exts);
kfree(fnew);
@@ -332,13 +320,13 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
if (f == NULL)
return -ENOBUFS;
- err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_init(&f->exts, net, TCA_FW_ACT, TCA_FW_POLICE);
if (err < 0)
goto errout;
f->id = handle;
f->tp = tp;
- err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack);
+ err = fw_set_parms(net, tp, f, tb, tca, base, flags, extack);
if (err < 0)
goto errout;
@@ -354,7 +342,8 @@ errout:
return err;
}
-static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct fw_head *head = rtnl_dereference(tp->root);
int h;
@@ -370,21 +359,14 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
for (f = rtnl_dereference(head->ht[h]); f;
f = rtnl_dereference(f->next)) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
+ if (!tc_cls_stats_dump(tp, arg, f))
return;
- }
- arg->count++;
}
}
}
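
The open-coded skip/count/stop bookkeeping in fw_walk() is folded into tc_cls_stats_dump(); route4_walk() below gets the same treatment. A sketch of the helper's likely behaviour, written to match the loop body it replaces:

/* Sketch: returns false when the walk should stop. */
static inline bool example_cls_stats_dump(struct tcf_proto *tp,
					  struct tcf_walker *arg, void *filter)
{
	if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) {
		arg->stop = 1;
		return false;
	}
	arg->count++;		/* counts skipped and dumped entries alike */
	return true;
}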
static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = fh;
@@ -398,21 +380,19 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (!f->res.classid && !tcf_exts_has_actions(&f->exts))
return skb->len;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (f->res.classid &&
nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (f->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, f->ifindex);
if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
goto nla_put_failure;
}
-#endif /* CONFIG_NET_CLS_IND */
if (head->mask != 0xFFFFFFFF &&
nla_put_u32(skb, TCA_FW_MASK, head->mask))
goto nla_put_failure;
@@ -432,12 +412,12 @@ nla_put_failure:
return -1;
}
-static void fw_bind_class(void *fh, u32 classid, unsigned long cl)
+static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct fw_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ tc_cls_bind_class(classid, cl, q, &f->res, base);
}
static struct tcf_proto_ops cls_fw_ops __read_mostly = {
@@ -453,6 +433,7 @@ static struct tcf_proto_ops cls_fw_ops __read_mostly = {
.bind_class = fw_bind_class,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_CLS("fw");
static int __init init_fw(void)
{
@@ -466,4 +447,5 @@ static void __exit exit_fw(void)
module_init(init_fw)
module_exit(exit_fw)
+MODULE_DESCRIPTION("SKB mark based TC classifier");
MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index a1b803fd372e..f03bf5da39ee 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/cls_matchall.c		Match-all classifier
*
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -16,6 +12,7 @@
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
struct cls_mall_head {
struct tcf_exts exts;
@@ -25,13 +22,18 @@ struct cls_mall_head {
unsigned int in_hw_count;
struct tc_matchall_pcnt __percpu *pf;
struct rcu_work rwork;
+ bool deleting;
};
-static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int mall_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct cls_mall_head *head = rcu_dereference_bh(tp->root);
+ if (unlikely(!head))
+ return -1;
+
if (tc_skip_sw(head->flags))
return -1;
@@ -75,8 +77,8 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = cookie;
- tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
- tcf_block_offload_dec(block, &head->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, false,
+ &head->flags, &head->in_hw_count, true);
}
static int mall_replace_hw_filter(struct tcf_proto *tp,
@@ -89,18 +91,31 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
bool skip_sw = tc_skip_sw(head->flags);
int err;
+ cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts));
+ if (!cls_mall.rule)
+ return -ENOMEM;
+
tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
cls_mall.command = TC_CLSMATCHALL_REPLACE;
- cls_mall.exts = &head->exts;
cls_mall.cookie = cookie;
- err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw);
- if (err < 0) {
+ err = tc_setup_offload_action(&cls_mall.rule->action, &head->exts,
+ cls_mall.common.extack);
+ if (err) {
+ kfree(cls_mall.rule);
+ mall_destroy_hw_filter(tp, head, cookie, NULL);
+
+ return skip_sw ? err : 0;
+ }
+
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall,
+ skip_sw, &head->flags, &head->in_hw_count, true);
+ tc_cleanup_offload_action(&cls_mall.rule->action);
+ kfree(cls_mall.rule);
+
+ if (err) {
mall_destroy_hw_filter(tp, head, cookie, NULL);
return err;
- } else if (err > 0) {
- head->in_hw_count = err;
- tcf_block_offload_inc(block, &head->flags);
}
if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
@@ -109,7 +124,8 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
return 0;
}
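
mall_replace_hw_filter() now goes through the flow_rule based offload API instead of handing tcf_exts straight to drivers: allocate a rule sized for the filter's actions, translate the actions, issue the block callback, then free the intermediate rule. Condensed into a hedged sketch that reuses the calls visible in the hunk above (error handling trimmed, example_* names are placeholders):

static int example_offload_replace(struct tcf_block *block,
				   struct tcf_proto *tp,
				   struct cls_mall_head *head,
				   struct tc_cls_matchall_offload *cls_mall)
{
	int err;

	cls_mall->rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts));
	if (!cls_mall->rule)
		return -ENOMEM;

	/* tcf_exts -> flow_action translation consumed by drivers */
	err = tc_setup_offload_action(&cls_mall->rule->action, &head->exts,
				      cls_mall->common.extack);
	if (err) {
		kfree(cls_mall->rule);
		return err;
	}

	err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, cls_mall,
			      tc_skip_sw(head->flags), &head->flags,
			      &head->in_hw_count, true);

	tc_cleanup_offload_action(&cls_mall->rule->action);
	kfree(cls_mall->rule);
	return err;
}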
-static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void mall_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
@@ -129,42 +145,31 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
static void *mall_get(struct tcf_proto *tp, u32 handle)
{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ if (head && head->handle == handle)
+ return head;
+
return NULL;
}
static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
[TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC },
[TCA_MATCHALL_CLASSID] = { .type = NLA_U32 },
+ [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 },
};
-static int mall_set_parms(struct net *net, struct tcf_proto *tp,
- struct cls_mall_head *head,
- unsigned long base, struct nlattr **tb,
- struct nlattr *est, bool ovr,
- struct netlink_ext_ack *extack)
-{
- int err;
-
- err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, extack);
- if (err < 0)
- return err;
-
- if (tb[TCA_MATCHALL_CLASSID]) {
- head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
- tcf_bind_filter(tp, &head->res, base);
- }
- return 0;
-}
-
static int mall_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
struct nlattr *tb[TCA_MATCHALL_MAX + 1];
+ bool bound_to_filter = false;
struct cls_mall_head *new;
- u32 flags = 0;
+ u32 userflags = 0;
int err;
if (!tca[TCA_OPTIONS])
@@ -173,14 +178,14 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
if (head)
return -EEXIST;
- err = nla_parse_nested(tb, TCA_MATCHALL_MAX, tca[TCA_OPTIONS],
- mall_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_MATCHALL_MAX,
+ tca[TCA_OPTIONS], mall_policy, NULL);
if (err < 0)
return err;
if (tb[TCA_MATCHALL_FLAGS]) {
- flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]);
- if (!tc_flags_valid(flags))
+ userflags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]);
+ if (!tc_flags_valid(userflags))
return -EINVAL;
}
@@ -188,25 +193,31 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
if (!new)
return -ENOBUFS;
- err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
+ err = tcf_exts_init(&new->exts, net, TCA_MATCHALL_ACT, 0);
if (err)
goto err_exts_init;
if (!handle)
handle = 1;
new->handle = handle;
- new->flags = flags;
+ new->flags = userflags;
new->pf = alloc_percpu(struct tc_matchall_pcnt);
if (!new->pf) {
err = -ENOMEM;
goto err_alloc_percpu;
}
- err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr,
- extack);
- if (err)
+ err = tcf_exts_validate_ex(net, tp, tb, tca[TCA_RATE],
+ &new->exts, flags, new->flags, extack);
+ if (err < 0)
goto err_set_parms;
+ if (tb[TCA_MATCHALL_CLASSID]) {
+ new->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]);
+ tcf_bind_filter(tp, &new->res, base);
+ bound_to_filter = true;
+ }
+
if (!tc_skip_hw(new->flags)) {
err = mall_replace_hw_filter(tp, new, (unsigned long)new,
extack);
@@ -217,11 +228,15 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
if (!tc_in_hw(new->flags))
new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ tcf_proto_update_usesw(tp, new->flags);
+
*arg = head;
rcu_assign_pointer(tp->root, new);
return 0;
err_replace_hw_filter:
+ if (bound_to_filter)
+ tcf_unbind_filter(tp, &new->res);
err_set_parms:
free_percpu(new->pf);
err_alloc_percpu:
@@ -232,24 +247,32 @@ err_exts_init:
}
static int mall_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
- return -EOPNOTSUPP;
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ head->deleting = true;
+ *last = true;
+ return 0;
}
-static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
if (arg->count < arg->skip)
goto skip;
+
+ if (!head || head->deleting)
+ return;
if (arg->fn(tp, head, arg) < 0)
arg->stop = 1;
skip:
arg->count++;
}
-static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
@@ -260,26 +283,50 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
if (tc_skip_hw(head->flags))
return 0;
+ cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts));
+ if (!cls_mall.rule)
+ return -ENOMEM;
+
tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
cls_mall.command = add ?
TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
- cls_mall.exts = &head->exts;
cls_mall.cookie = (unsigned long)head;
- err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+ err = tc_setup_offload_action(&cls_mall.rule->action, &head->exts,
+ cls_mall.common.extack);
if (err) {
- if (add && tc_skip_sw(head->flags))
- return err;
- return 0;
+ kfree(cls_mall.rule);
+
+ return add && tc_skip_sw(head->flags) ? err : 0;
}
- tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSMATCHALL,
+ &cls_mall, cb_priv, &head->flags,
+ &head->in_hw_count);
+ tc_cleanup_offload_action(&cls_mall.rule->action);
+ kfree(cls_mall.rule);
- return 0;
+ return err;
+}
+
+static void mall_stats_hw_filter(struct tcf_proto *tp,
+ struct cls_mall_head *head,
+ unsigned long cookie)
+{
+ struct tc_cls_matchall_offload cls_mall = {};
+ struct tcf_block *block = tp->chain->block;
+
+ tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, NULL);
+ cls_mall.command = TC_CLSMATCHALL_STATS;
+ cls_mall.cookie = cookie;
+
+ tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
+
+ tcf_exts_hw_stats_update(&head->exts, &cls_mall.stats, cls_mall.use_act_stats);
}
static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct tc_matchall_pcnt gpf = {};
struct cls_mall_head *head = fh;
@@ -289,9 +336,12 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (!head)
return skb->len;
+ if (!tc_skip_hw(head->flags))
+ mall_stats_hw_filter(tp, head, (unsigned long)head);
+
t->tcm_handle = head->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
@@ -328,12 +378,12 @@ nla_put_failure:
return -1;
}
-static void mall_bind_class(void *fh, u32 classid, unsigned long cl)
+static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct cls_mall_head *head = fh;
- if (head && head->res.classid == classid)
- head->res.class = cl;
+ tc_cls_bind_class(classid, cl, q, &head->res, base);
}
static struct tcf_proto_ops cls_mall_ops __read_mostly = {
@@ -350,6 +400,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = {
.bind_class = mall_bind_class,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_CLS("matchall");
static int __init cls_mall_init(void)
{
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 0404aa5fa7cb..b9c58c040c30 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_route.c ROUTE4 classifier.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -21,6 +17,7 @@
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
/*
* 1. For now we assume that route tags < 256.
@@ -125,8 +122,9 @@ static inline int route4_hash_wild(void)
return 0; \
}
-static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int route4_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct route4_head *head = rcu_dereference_bh(tp->root);
struct dst_entry *dst;
@@ -276,7 +274,8 @@ static void route4_queue_work(struct route4_filter *f)
tcf_queue_work(&f->rwork, route4_delete_filter_work);
}
-static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void route4_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
int h1, h2;
@@ -312,7 +311,7 @@ static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
}
static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter *f = arg;
@@ -376,16 +375,16 @@ out:
static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
[TCA_ROUTE4_CLASSID] = { .type = NLA_U32 },
- [TCA_ROUTE4_TO] = { .type = NLA_U32 },
- [TCA_ROUTE4_FROM] = { .type = NLA_U32 },
- [TCA_ROUTE4_IIF] = { .type = NLA_U32 },
+ [TCA_ROUTE4_TO] = NLA_POLICY_MAX(NLA_U32, 0xFF),
+ [TCA_ROUTE4_FROM] = NLA_POLICY_MAX(NLA_U32, 0xFF),
+ [TCA_ROUTE4_IIF] = NLA_POLICY_MAX(NLA_U32, 0x7FFF),
};
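
Moving the range checks into route4_policy means malformed attributes are now rejected during nla_parse_nested_deprecated(), before route4_set_parms() ever runs, which is why the open-coded '> 0xFF' and '> 0x7FFF' tests disappear below. A hypothetical policy using the same declarative bound (the EXAMPLE_* names are placeholders):

/* Values above 0xFF for EXAMPLE_ATTR_REALM fail parsing with an extack
 * error instead of needing a manual check in the change handler.
 */
static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_REALM]	= NLA_POLICY_MAX(NLA_U32, 0xFF),
	[EXAMPLE_ATTR_CLASSID]	= { .type = NLA_U32 },
};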
static int route4_set_parms(struct net *net, struct tcf_proto *tp,
unsigned long base, struct route4_filter *f,
u32 handle, struct route4_head *head,
struct nlattr **tb, struct nlattr *est, int new,
- bool ovr, struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
u32 id = 0, to = 0, nhandle = 0x8000;
struct route4_filter *fp;
@@ -393,38 +392,47 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
struct route4_bucket *b;
int err;
- err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack);
if (err < 0)
return err;
if (tb[TCA_ROUTE4_TO]) {
- if (new && handle & 0x8000)
+ if (new && handle & 0x8000) {
+ NL_SET_ERR_MSG(extack, "Invalid handle");
return -EINVAL;
+ }
to = nla_get_u32(tb[TCA_ROUTE4_TO]);
- if (to > 0xFF)
- return -EINVAL;
nhandle = to;
}
+ if (tb[TCA_ROUTE4_FROM] && tb[TCA_ROUTE4_IIF]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_ROUTE4_FROM],
+ "'from' and 'fromif' are mutually exclusive");
+ return -EINVAL;
+ }
+
if (tb[TCA_ROUTE4_FROM]) {
- if (tb[TCA_ROUTE4_IIF])
- return -EINVAL;
id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
- if (id > 0xFF)
- return -EINVAL;
nhandle |= id << 16;
} else if (tb[TCA_ROUTE4_IIF]) {
id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
- if (id > 0x7FFF)
- return -EINVAL;
nhandle |= (id | 0x8000) << 16;
} else
nhandle |= 0xFFFF << 16;
if (handle && new) {
nhandle |= handle & 0x7F00;
- if (nhandle != handle)
+ if (nhandle != handle) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Handle mismatch constructed: %x (expected: %x)",
+ handle, nhandle);
return -EINVAL;
+ }
+ }
+
+ if (!nhandle) {
+ NL_SET_ERR_MSG(extack, "Replacing with handle of 0 is invalid");
+ return -EINVAL;
}
h1 = to_hash(nhandle);
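
The handle route4 constructs here packs the 'to' realm into bits 0-7, bits 8-14 of the user-supplied handle into the middle, and the 'from' realm or input interface into the upper 16 bits (with the top bit of that field flagging the iif case). A worked example of the composition:

/* Worked example (values chosen for illustration): a new filter created
 * with handle 0x00024310 and
 *   TCA_ROUTE4_TO   = 0x10   ->  nhandle  = 0x00000010
 *   TCA_ROUTE4_FROM = 0x02   ->  nhandle |= 0x02 << 16      (0x00020010)
 *   handle & 0x7F00 = 0x4300 ->  nhandle |= 0x4300          (0x00024310)
 * matches the requested handle exactly, so the "Handle mismatch" check
 * passes.  With TCA_ROUTE4_IIF = 5 instead of FROM, the upper half would
 * be (5 | 0x8000) << 16, and with neither attribute it is 0xFFFF << 16.
 */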
@@ -467,28 +475,35 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
static int route4_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, void **arg, bool ovr,
+ struct nlattr **tca, void **arg, u32 flags,
struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter __rcu **fp;
struct route4_filter *fold, *f1, *pfp, *f = NULL;
struct route4_bucket *b;
- struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ROUTE4_MAX + 1];
unsigned int h, th;
int err;
bool new = true;
- if (opt == NULL)
- return handle ? -EINVAL : 0;
+ if (!handle) {
+ NL_SET_ERR_MSG(extack, "Creating with handle of 0 is invalid");
+ return -EINVAL;
+ }
- err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy, NULL);
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing options");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, tca[TCA_OPTIONS],
+ route4_policy, NULL);
if (err < 0)
return err;
fold = *arg;
- if (fold && handle && fold->handle != handle)
+ if (fold && fold->handle != handle)
return -EINVAL;
err = -ENOBUFS;
@@ -496,14 +511,13 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
if (!f)
goto errout;
- err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ err = tcf_exts_init(&f->exts, net, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
if (err < 0)
goto errout;
if (fold) {
f->id = fold->id;
f->iif = fold->iif;
- f->res = fold->res;
f->handle = fold->handle;
f->tp = fold->tp;
@@ -512,7 +526,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
}
err = route4_set_parms(net, tp, base, f, handle, head, tb,
- tca[TCA_RATE], new, ovr, extack);
+ tca[TCA_RATE], new, flags, extack);
if (err < 0)
goto errout;
@@ -528,7 +542,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
rcu_assign_pointer(f->next, f1);
rcu_assign_pointer(*fp, f);
- if (fold && fold->handle && f->handle != fold->handle) {
+ if (fold) {
th = to_hash(fold->handle);
h = from_hash(fold->handle >> 16);
b = rtnl_dereference(head->table[th]);
@@ -536,8 +550,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
fp = &b->ht[h];
for (pfp = rtnl_dereference(*fp); pfp;
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
- if (pfp == f) {
- *fp = f->next;
+ if (pfp == fold) {
+ rcu_assign_pointer(*fp, fold->next);
break;
}
}
@@ -560,15 +574,13 @@ errout:
return err;
}
-static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct route4_head *head = rtnl_dereference(tp->root);
unsigned int h, h1;
- if (head == NULL)
- arg->stop = 1;
-
- if (arg->stop)
+ if (head == NULL || arg->stop)
return;
for (h = 0; h <= 256; h++) {
@@ -581,15 +593,8 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
for (f = rtnl_dereference(b->ht[h1]);
f;
f = rtnl_dereference(f->next)) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
+ if (!tc_cls_stats_dump(tp, arg, f))
return;
- }
- arg->count++;
}
}
}
@@ -597,7 +602,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct route4_filter *f = fh;
struct nlattr *nest;
@@ -608,7 +613,7 @@ static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh,
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -645,12 +650,12 @@ nla_put_failure:
return -1;
}
-static void route4_bind_class(void *fh, u32 classid, unsigned long cl)
+static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct route4_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ tc_cls_bind_class(classid, cl, q, &f->res, base);
}
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
@@ -666,6 +671,7 @@ static struct tcf_proto_ops cls_route4_ops __read_mostly = {
.bind_class = route4_bind_class,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_CLS("route");
static int __init init_route4(void)
{
@@ -679,4 +685,5 @@ static void __exit exit_route4(void)
module_init(init_route4)
module_exit(exit_route4)
+MODULE_DESCRIPTION("Routing table realm based TC classifier");
MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
deleted file mode 100644
index cbb5e0d600f3..000000000000
--- a/net/sched/cls_rsvp.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <net/ip.h>
-#include <net/netlink.h>
-#include <net/act_api.h>
-#include <net/pkt_cls.h>
-
-#define RSVP_DST_LEN 1
-#define RSVP_ID "rsvp"
-#define RSVP_OPS cls_rsvp_ops
-
-#include "cls_rsvp.h"
-MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
deleted file mode 100644
index e9ccf7daea7d..000000000000
--- a/net/sched/cls_rsvp.h
+++ /dev/null
@@ -1,772 +0,0 @@
-/*
- * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-/*
-   Compared to the general packet classification problem,
-   RSVP needs only several relatively simple rules:
-
- * (dst, protocol) are always specified,
- so that we are able to hash them.
- * src may be exact, or may be wildcard, so that
- we can keep a hash table plus one wildcard entry.
- * source port (or flow label) is important only if src is given.
-
- IMPLEMENTATION.
-
- We use a two level hash table: The top level is keyed by
- destination address and protocol ID, every bucket contains a list
- of "rsvp sessions", identified by destination address, protocol and
- DPI(="Destination Port ID"): triple (key, mask, offset).
-
- Every bucket has a smaller hash table keyed by source address
- (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
- Every bucket is again a list of "RSVP flows", selected by
- source address and SPI(="Source Port ID" here rather than
- "security parameter index"): triple (key, mask, offset).
-
-
- NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
- and all fragmented packets go to the best-effort traffic class.
-
-
-   NOTE 2. Two "port id"s seem to be redundant, rfc2207 requires
- only one "Generalized Port Identifier". So that for classic
- ah, esp (and udp,tcp) both *pi should coincide or one of them
- should be wildcard.
-
- At first sight, this redundancy is just a waste of CPU
- resources. But DPI and SPI add the possibility to assign different
- priorities to GPIs. Look also at note 4 about tunnels below.
-
-
- NOTE 3. One complication is the case of tunneled packets.
- We implement it as following: if the first lookup
- matches a special session with "tunnelhdr" value not zero,
- flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
- In this case, we pull tunnelhdr bytes and restart lookup
- with tunnel ID added to the list of keys. Simple and stupid 8)8)
- It's enough for PIMREG and IPIP.
-
-
- NOTE 4. Two GPIs make it possible to parse even GRE packets.
- F.e. DPI can select ETH_P_IP (and necessary flags to make
- tunnelhdr correct) in GRE protocol field and SPI matches
- GRE key. Is it not nice? 8)8)
-
-
- Well, as result, despite its simplicity, we get a pretty
- powerful classification engine. */
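
The (key, mask, offset) triples described above are applied literally in rsvp_classify() below: the classifier reads one 32-bit word at the given offset past the IP header and compares it to the key under the mask. An illustrative GPI that would match UDP/TCP destination port 5004 (values invented for the example, xprt pointing just past the IP header as in rsvp_classify()):

static bool example_gpi_match_dport_5004(const u8 *xprt)
{
	__be32 key  = htonl(0x0000138C);	/* 5004 in the dst-port half */
	__be32 mask = htonl(0x0000FFFF);	/* ignore the src-port half  */
	int offset  = 0;			/* first 32-bit word of L4   */

	return !(mask & (*(const __be32 *)(xprt + offset) ^ key));
}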
-
-
-struct rsvp_head {
- u32 tmap[256/32];
- u32 hgenerator;
- u8 tgenerator;
- struct rsvp_session __rcu *ht[256];
- struct rcu_head rcu;
-};
-
-struct rsvp_session {
- struct rsvp_session __rcu *next;
- __be32 dst[RSVP_DST_LEN];
- struct tc_rsvp_gpi dpi;
- u8 protocol;
- u8 tunnelid;
- /* 16 (src,sport) hash slots, and one wildcard source slot */
- struct rsvp_filter __rcu *ht[16 + 1];
- struct rcu_head rcu;
-};
-
-
-struct rsvp_filter {
- struct rsvp_filter __rcu *next;
- __be32 src[RSVP_DST_LEN];
- struct tc_rsvp_gpi spi;
- u8 tunnelhdr;
-
- struct tcf_result res;
- struct tcf_exts exts;
-
- u32 handle;
- struct rsvp_session *sess;
- struct rcu_work rwork;
-};
-
-static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
-{
- unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
-
- h ^= h>>16;
- h ^= h>>8;
- return (h ^ protocol ^ tunnelid) & 0xFF;
-}
-
-static inline unsigned int hash_src(__be32 *src)
-{
- unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
-
- h ^= h>>16;
- h ^= h>>8;
- h ^= h>>4;
- return h & 0xF;
-}
-
-#define RSVP_APPLY_RESULT() \
-{ \
- int r = tcf_exts_exec(skb, &f->exts, res); \
- if (r < 0) \
- continue; \
- else if (r > 0) \
- return r; \
-}
-
-static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
-{
- struct rsvp_head *head = rcu_dereference_bh(tp->root);
- struct rsvp_session *s;
- struct rsvp_filter *f;
- unsigned int h1, h2;
- __be32 *dst, *src;
- u8 protocol;
- u8 tunnelid = 0;
- u8 *xprt;
-#if RSVP_DST_LEN == 4
- struct ipv6hdr *nhptr;
-
- if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
- return -1;
- nhptr = ipv6_hdr(skb);
-#else
- struct iphdr *nhptr;
-
- if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
- return -1;
- nhptr = ip_hdr(skb);
-#endif
-restart:
-
-#if RSVP_DST_LEN == 4
- src = &nhptr->saddr.s6_addr32[0];
- dst = &nhptr->daddr.s6_addr32[0];
- protocol = nhptr->nexthdr;
- xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
-#else
- src = &nhptr->saddr;
- dst = &nhptr->daddr;
- protocol = nhptr->protocol;
- xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
- if (ip_is_fragment(nhptr))
- return -1;
-#endif
-
- h1 = hash_dst(dst, protocol, tunnelid);
- h2 = hash_src(src);
-
- for (s = rcu_dereference_bh(head->ht[h1]); s;
- s = rcu_dereference_bh(s->next)) {
- if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
- protocol == s->protocol &&
- !(s->dpi.mask &
- (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
-#if RSVP_DST_LEN == 4
- dst[0] == s->dst[0] &&
- dst[1] == s->dst[1] &&
- dst[2] == s->dst[2] &&
-#endif
- tunnelid == s->tunnelid) {
-
- for (f = rcu_dereference_bh(s->ht[h2]); f;
- f = rcu_dereference_bh(f->next)) {
- if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
- !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
-#if RSVP_DST_LEN == 4
- &&
- src[0] == f->src[0] &&
- src[1] == f->src[1] &&
- src[2] == f->src[2]
-#endif
- ) {
- *res = f->res;
- RSVP_APPLY_RESULT();
-
-matched:
- if (f->tunnelhdr == 0)
- return 0;
-
- tunnelid = f->res.classid;
- nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
- goto restart;
- }
- }
-
- /* And wildcard bucket... */
- for (f = rcu_dereference_bh(s->ht[16]); f;
- f = rcu_dereference_bh(f->next)) {
- *res = f->res;
- RSVP_APPLY_RESULT();
- goto matched;
- }
- return -1;
- }
- }
- return -1;
-}
-
-static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- struct rsvp_session *s;
- struct rsvp_filter __rcu **ins;
- struct rsvp_filter *pins;
- unsigned int h1 = h & 0xFF;
- unsigned int h2 = (h >> 8) & 0xFF;
-
- for (s = rtnl_dereference(head->ht[h1]); s;
- s = rtnl_dereference(s->next)) {
- for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ;
- ins = &pins->next, pins = rtnl_dereference(*ins)) {
- if (pins->handle == h) {
- RCU_INIT_POINTER(n->next, pins->next);
- rcu_assign_pointer(*ins, n);
- return;
- }
- }
- }
-
-	/* Something went wrong if we are trying to replace a non-existent
-	 * node. Might as well halt instead of silently failing.
- */
- BUG_ON(1);
-}
-
-static void *rsvp_get(struct tcf_proto *tp, u32 handle)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- struct rsvp_session *s;
- struct rsvp_filter *f;
- unsigned int h1 = handle & 0xFF;
- unsigned int h2 = (handle >> 8) & 0xFF;
-
- if (h2 > 16)
- return NULL;
-
- for (s = rtnl_dereference(head->ht[h1]); s;
- s = rtnl_dereference(s->next)) {
- for (f = rtnl_dereference(s->ht[h2]); f;
- f = rtnl_dereference(f->next)) {
- if (f->handle == handle)
- return f;
- }
- }
- return NULL;
-}
-
-static int rsvp_init(struct tcf_proto *tp)
-{
- struct rsvp_head *data;
-
- data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
- if (data) {
- rcu_assign_pointer(tp->root, data);
- return 0;
- }
- return -ENOBUFS;
-}
-
-static void __rsvp_delete_filter(struct rsvp_filter *f)
-{
- tcf_exts_destroy(&f->exts);
- tcf_exts_put_net(&f->exts);
- kfree(f);
-}
-
-static void rsvp_delete_filter_work(struct work_struct *work)
-{
- struct rsvp_filter *f = container_of(to_rcu_work(work),
- struct rsvp_filter,
- rwork);
- rtnl_lock();
- __rsvp_delete_filter(f);
- rtnl_unlock();
-}
-
-static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
-{
- tcf_unbind_filter(tp, &f->res);
- /* all classifiers are required to call tcf_exts_destroy() after rcu
- * grace period, since converted-to-rcu actions are relying on that
- * in cleanup() callback
- */
- if (tcf_exts_get_net(&f->exts))
- tcf_queue_work(&f->rwork, rsvp_delete_filter_work);
- else
- __rsvp_delete_filter(f);
-}
-
-static void rsvp_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
-{
- struct rsvp_head *data = rtnl_dereference(tp->root);
- int h1, h2;
-
- if (data == NULL)
- return;
-
- for (h1 = 0; h1 < 256; h1++) {
- struct rsvp_session *s;
-
- while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
- RCU_INIT_POINTER(data->ht[h1], s->next);
-
- for (h2 = 0; h2 <= 16; h2++) {
- struct rsvp_filter *f;
-
- while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
- rcu_assign_pointer(s->ht[h2], f->next);
- rsvp_delete_filter(tp, f);
- }
- }
- kfree_rcu(s, rcu);
- }
- }
- kfree_rcu(data, rcu);
-}
-
-static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- struct rsvp_filter *nfp, *f = arg;
- struct rsvp_filter __rcu **fp;
- unsigned int h = f->handle;
- struct rsvp_session __rcu **sp;
- struct rsvp_session *nsp, *s = f->sess;
- int i, h1;
-
- fp = &s->ht[(h >> 8) & 0xFF];
- for (nfp = rtnl_dereference(*fp); nfp;
- fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
- if (nfp == f) {
- RCU_INIT_POINTER(*fp, f->next);
- rsvp_delete_filter(tp, f);
-
- /* Strip tree */
-
- for (i = 0; i <= 16; i++)
- if (s->ht[i])
- goto out;
-
- /* OK, session has no flows */
- sp = &head->ht[h & 0xFF];
- for (nsp = rtnl_dereference(*sp); nsp;
- sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
- if (nsp == s) {
- RCU_INIT_POINTER(*sp, s->next);
- kfree_rcu(s, rcu);
- goto out;
- }
- }
-
- break;
- }
- }
-
-out:
- *last = true;
- for (h1 = 0; h1 < 256; h1++) {
- if (rcu_access_pointer(head->ht[h1])) {
- *last = false;
- break;
- }
- }
-
- return 0;
-}
-
-static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
-{
- struct rsvp_head *data = rtnl_dereference(tp->root);
- int i = 0xFFFF;
-
- while (i-- > 0) {
- u32 h;
-
- if ((data->hgenerator += 0x10000) == 0)
- data->hgenerator = 0x10000;
- h = data->hgenerator|salt;
- if (!rsvp_get(tp, h))
- return h;
- }
- return 0;
-}
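
The salt passed to gen_handle() carries the two hash bucket indices (destination bucket in bits 0-7, source bucket in bits 8-15, exactly as rsvp_get() decodes them above), and the per-head generator fills the upper 16 bits. A quick worked example:

/* Worked example: a filter whose destination hashes to bucket 0x2A and
 * whose source hashes to bucket 0x03 is created with
 *   salt   = 0x2A | (0x03 << 8)  = 0x032A
 *   handle = hgenerator | salt   = 0x0001032A   (first allocation)
 * The next filter in the same buckets gets 0x0002032A, and so on, until
 * the 16-bit generator space is exhausted and gen_handle() returns 0.
 */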
-
-static int tunnel_bts(struct rsvp_head *data)
-{
- int n = data->tgenerator >> 5;
- u32 b = 1 << (data->tgenerator & 0x1F);
-
- if (data->tmap[n] & b)
- return 0;
- data->tmap[n] |= b;
- return 1;
-}
-
-static void tunnel_recycle(struct rsvp_head *data)
-{
- struct rsvp_session __rcu **sht = data->ht;
- u32 tmap[256/32];
- int h1, h2;
-
- memset(tmap, 0, sizeof(tmap));
-
- for (h1 = 0; h1 < 256; h1++) {
- struct rsvp_session *s;
- for (s = rtnl_dereference(sht[h1]); s;
- s = rtnl_dereference(s->next)) {
- for (h2 = 0; h2 <= 16; h2++) {
- struct rsvp_filter *f;
-
- for (f = rtnl_dereference(s->ht[h2]); f;
- f = rtnl_dereference(f->next)) {
- if (f->tunnelhdr == 0)
- continue;
- data->tgenerator = f->res.classid;
- tunnel_bts(data);
- }
- }
- }
- }
-
- memcpy(data->tmap, tmap, sizeof(tmap));
-}
-
-static u32 gen_tunnel(struct rsvp_head *data)
-{
- int i, k;
-
- for (k = 0; k < 2; k++) {
- for (i = 255; i > 0; i--) {
- if (++data->tgenerator == 0)
- data->tgenerator = 1;
- if (tunnel_bts(data))
- return data->tgenerator;
- }
- tunnel_recycle(data);
- }
- return 0;
-}
-
-static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
- [TCA_RSVP_CLASSID] = { .type = NLA_U32 },
- [TCA_RSVP_DST] = { .type = NLA_BINARY,
- .len = RSVP_DST_LEN * sizeof(u32) },
- [TCA_RSVP_SRC] = { .type = NLA_BINARY,
- .len = RSVP_DST_LEN * sizeof(u32) },
- [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
-};
-
-static int rsvp_change(struct net *net, struct sk_buff *in_skb,
- struct tcf_proto *tp, unsigned long base,
- u32 handle,
- struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
-{
- struct rsvp_head *data = rtnl_dereference(tp->root);
- struct rsvp_filter *f, *nfp;
- struct rsvp_filter __rcu **fp;
- struct rsvp_session *nsp, *s;
- struct rsvp_session __rcu **sp;
- struct tc_rsvp_pinfo *pinfo = NULL;
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_RSVP_MAX + 1];
- struct tcf_exts e;
- unsigned int h1, h2;
- __be32 *dst;
- int err;
-
- if (opt == NULL)
- return handle ? -EINVAL : 0;
-
- err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy, NULL);
- if (err < 0)
- return err;
-
- err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
- if (err < 0)
- return err;
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, extack);
- if (err < 0)
- goto errout2;
-
- f = *arg;
- if (f) {
- /* Node exists: adjust only classid */
- struct rsvp_filter *n;
-
- if (f->handle != handle && handle)
- goto errout2;
-
- n = kmemdup(f, sizeof(*f), GFP_KERNEL);
- if (!n) {
- err = -ENOMEM;
- goto errout2;
- }
-
- err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
- if (err < 0) {
- kfree(n);
- goto errout2;
- }
-
- if (tb[TCA_RSVP_CLASSID]) {
- n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
- tcf_bind_filter(tp, &n->res, base);
- }
-
- tcf_exts_change(&n->exts, &e);
- rsvp_replace(tp, n, handle);
- return 0;
- }
-
- /* Now more serious part... */
- err = -EINVAL;
- if (handle)
- goto errout2;
- if (tb[TCA_RSVP_DST] == NULL)
- goto errout2;
-
- err = -ENOBUFS;
- f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
- if (f == NULL)
- goto errout2;
-
- err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
- if (err < 0)
- goto errout;
- h2 = 16;
- if (tb[TCA_RSVP_SRC]) {
- memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
- h2 = hash_src(f->src);
- }
- if (tb[TCA_RSVP_PINFO]) {
- pinfo = nla_data(tb[TCA_RSVP_PINFO]);
- f->spi = pinfo->spi;
- f->tunnelhdr = pinfo->tunnelhdr;
- }
- if (tb[TCA_RSVP_CLASSID])
- f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
-
- dst = nla_data(tb[TCA_RSVP_DST]);
- h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
-
- err = -ENOMEM;
- if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
- goto errout;
-
- if (f->tunnelhdr) {
- err = -EINVAL;
- if (f->res.classid > 255)
- goto errout;
-
- err = -ENOMEM;
- if (f->res.classid == 0 &&
- (f->res.classid = gen_tunnel(data)) == 0)
- goto errout;
- }
-
- for (sp = &data->ht[h1];
- (s = rtnl_dereference(*sp)) != NULL;
- sp = &s->next) {
- if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
- pinfo && pinfo->protocol == s->protocol &&
- memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
-#if RSVP_DST_LEN == 4
- dst[0] == s->dst[0] &&
- dst[1] == s->dst[1] &&
- dst[2] == s->dst[2] &&
-#endif
- pinfo->tunnelid == s->tunnelid) {
-
-insert:
-			/* OK, we found an appropriate session */
-
- fp = &s->ht[h2];
-
- f->sess = s;
- if (f->tunnelhdr == 0)
- tcf_bind_filter(tp, &f->res, base);
-
- tcf_exts_change(&f->exts, &e);
-
- fp = &s->ht[h2];
- for (nfp = rtnl_dereference(*fp); nfp;
- fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
- __u32 mask = nfp->spi.mask & f->spi.mask;
-
- if (mask != f->spi.mask)
- break;
- }
- RCU_INIT_POINTER(f->next, nfp);
- rcu_assign_pointer(*fp, f);
-
- *arg = f;
- return 0;
- }
- }
-
- /* No session found. Create new one. */
-
- err = -ENOBUFS;
- s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
- if (s == NULL)
- goto errout;
- memcpy(s->dst, dst, sizeof(s->dst));
-
- if (pinfo) {
- s->dpi = pinfo->dpi;
- s->protocol = pinfo->protocol;
- s->tunnelid = pinfo->tunnelid;
- }
- sp = &data->ht[h1];
- for (nsp = rtnl_dereference(*sp); nsp;
- sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
- if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
- break;
- }
- RCU_INIT_POINTER(s->next, nsp);
- rcu_assign_pointer(*sp, s);
-
- goto insert;
-
-errout:
- tcf_exts_destroy(&f->exts);
- kfree(f);
-errout2:
- tcf_exts_destroy(&e);
- return err;
-}
-
-static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
-{
- struct rsvp_head *head = rtnl_dereference(tp->root);
- unsigned int h, h1;
-
- if (arg->stop)
- return;
-
- for (h = 0; h < 256; h++) {
- struct rsvp_session *s;
-
- for (s = rtnl_dereference(head->ht[h]); s;
- s = rtnl_dereference(s->next)) {
- for (h1 = 0; h1 <= 16; h1++) {
- struct rsvp_filter *f;
-
- for (f = rtnl_dereference(s->ht[h1]); f;
- f = rtnl_dereference(f->next)) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
- return;
- }
- arg->count++;
- }
- }
- }
- }
-}
-
-static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
-{
- struct rsvp_filter *f = fh;
- struct rsvp_session *s;
- struct nlattr *nest;
- struct tc_rsvp_pinfo pinfo;
-
- if (f == NULL)
- return skb->len;
- s = f->sess;
-
- t->tcm_handle = f->handle;
-
- nest = nla_nest_start(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
-
- if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
- goto nla_put_failure;
- pinfo.dpi = s->dpi;
- pinfo.spi = f->spi;
- pinfo.protocol = s->protocol;
- pinfo.tunnelid = s->tunnelid;
- pinfo.tunnelhdr = f->tunnelhdr;
- pinfo.pad = 0;
- if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
- goto nla_put_failure;
- if (f->res.classid &&
- nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
- goto nla_put_failure;
- if (((f->handle >> 8) & 0xFF) != 16 &&
- nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
- goto nla_put_failure;
-
- if (tcf_exts_dump(skb, &f->exts) < 0)
- goto nla_put_failure;
-
- nla_nest_end(skb, nest);
-
- if (tcf_exts_dump_stats(skb, &f->exts) < 0)
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl)
-{
- struct rsvp_filter *f = fh;
-
- if (f && f->res.classid == classid)
- f->res.class = cl;
-}
-
-static struct tcf_proto_ops RSVP_OPS __read_mostly = {
- .kind = RSVP_ID,
- .classify = rsvp_classify,
- .init = rsvp_init,
- .destroy = rsvp_destroy,
- .get = rsvp_get,
- .change = rsvp_change,
- .delete = rsvp_delete,
- .walk = rsvp_walk,
- .dump = rsvp_dump,
- .bind_class = rsvp_bind_class,
- .owner = THIS_MODULE,
-};
-
-static int __init init_rsvp(void)
-{
- return register_tcf_proto_ops(&RSVP_OPS);
-}
-
-static void __exit exit_rsvp(void)
-{
- unregister_tcf_proto_ops(&RSVP_OPS);
-}
-
-module_init(init_rsvp)
-module_exit(exit_rsvp)
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
deleted file mode 100644
index dd08aea2aee5..000000000000
--- a/net/sched/cls_rsvp6.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/ipv6.h>
-#include <linux/skbuff.h>
-#include <net/act_api.h>
-#include <net/pkt_cls.h>
-#include <net/netlink.h>
-
-#define RSVP_DST_LEN 4
-#define RSVP_ID "rsvp6"
-#define RSVP_OPS cls_rsvp6_ops
-
-#include "cls_rsvp.h"
-MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
deleted file mode 100644
index 9ccc93f257db..000000000000
--- a/net/sched/cls_tcindex.c
+++ /dev/null
@@ -1,673 +0,0 @@
-/*
- * net/sched/cls_tcindex.c Packet classifier for skb->tc_index
- *
- * Written 1998,1999 by Werner Almesberger, EPFL ICA
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <net/act_api.h>
-#include <net/netlink.h>
-#include <net/pkt_cls.h>
-#include <net/sch_generic.h>
-
-/*
- * Passing parameters to the root seems to be done more awkwardly than really
- * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
- * verified. FIXME.
- */
-
-#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
-#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
-
-
-struct tcindex_filter_result {
- struct tcf_exts exts;
- struct tcf_result res;
- struct rcu_work rwork;
-};
-
-struct tcindex_filter {
- u16 key;
- struct tcindex_filter_result result;
- struct tcindex_filter __rcu *next;
- struct rcu_work rwork;
-};
-
-
-struct tcindex_data {
- struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
- struct tcindex_filter __rcu **h; /* imperfect hash; */
- struct tcf_proto *tp;
- u16 mask; /* AND key with mask */
- u32 shift; /* shift ANDed key to the right */
- u32 hash; /* hash table size; 0 if undefined */
- u32 alloc_hash; /* allocated size */
- u32 fall_through; /* 0: only classify if explicit match */
- struct rcu_head rcu;
-};
-
-static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
-{
- return tcf_exts_has_actions(&r->exts) || r->res.classid;
-}
-
-static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
- u16 key)
-{
- if (p->perfect) {
- struct tcindex_filter_result *f = p->perfect + key;
-
- return tcindex_filter_is_set(f) ? f : NULL;
- } else if (p->h) {
- struct tcindex_filter __rcu **fp;
- struct tcindex_filter *f;
-
- fp = &p->h[key % p->hash];
- for (f = rcu_dereference_bh_rtnl(*fp);
- f;
- fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
- if (f->key == key)
- return &f->result;
- }
-
- return NULL;
-}
-
-
-static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
-{
- struct tcindex_data *p = rcu_dereference_bh(tp->root);
- struct tcindex_filter_result *f;
- int key = (skb->tc_index & p->mask) >> p->shift;
-
- pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
- skb, tp, res, p);
-
- f = tcindex_lookup(p, key);
- if (!f) {
- struct Qdisc *q = tcf_block_q(tp->chain->block);
-
- if (!p->fall_through)
- return -1;
- res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key);
- res->class = 0;
- pr_debug("alg 0x%x\n", res->classid);
- return 0;
- }
- *res = f->res;
- pr_debug("map 0x%x\n", res->classid);
-
- return tcf_exts_exec(skb, &f->exts, res);
-}
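
tcindex derives its lookup key purely from skb->tc_index, so it relies on something earlier in the path having set that field. A worked example of the key computation and the fall-through result:

/* Worked example: mask = 0x00F0, shift = 4, skb->tc_index = 0x0137
 *   key = (0x0137 & 0x00F0) >> 4 = 0x0030 >> 4 = 3
 * If no result is set for key 3 and fall_through is enabled, the classid
 * becomes TC_H_MAKE(TC_H_MAJ(q->handle), 3): minor id 3 on the major
 * handle of the qdisc the filter is attached to.
 */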
-
-
-static void *tcindex_get(struct tcf_proto *tp, u32 handle)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r;
-
- pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
- if (p->perfect && handle >= p->alloc_hash)
- return NULL;
- r = tcindex_lookup(p, handle);
- return r && tcindex_filter_is_set(r) ? r : NULL;
-}
-
-static int tcindex_init(struct tcf_proto *tp)
-{
- struct tcindex_data *p;
-
- pr_debug("tcindex_init(tp %p)\n", tp);
- p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- p->mask = 0xffff;
- p->hash = DEFAULT_HASH_SIZE;
- p->fall_through = 1;
-
- rcu_assign_pointer(tp->root, p);
- return 0;
-}
-
-static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
-{
- tcf_exts_destroy(&r->exts);
- tcf_exts_put_net(&r->exts);
-}
-
-static void tcindex_destroy_rexts_work(struct work_struct *work)
-{
- struct tcindex_filter_result *r;
-
- r = container_of(to_rcu_work(work),
- struct tcindex_filter_result,
- rwork);
- rtnl_lock();
- __tcindex_destroy_rexts(r);
- rtnl_unlock();
-}
-
-static void __tcindex_destroy_fexts(struct tcindex_filter *f)
-{
- tcf_exts_destroy(&f->result.exts);
- tcf_exts_put_net(&f->result.exts);
- kfree(f);
-}
-
-static void tcindex_destroy_fexts_work(struct work_struct *work)
-{
- struct tcindex_filter *f = container_of(to_rcu_work(work),
- struct tcindex_filter,
- rwork);
-
- rtnl_lock();
- __tcindex_destroy_fexts(f);
- rtnl_unlock();
-}
-
-static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r = arg;
- struct tcindex_filter __rcu **walk;
- struct tcindex_filter *f = NULL;
-
- pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p);
- if (p->perfect) {
- if (!r->res.class)
- return -ENOENT;
- } else {
- int i;
-
- for (i = 0; i < p->hash; i++) {
- walk = p->h + i;
- for (f = rtnl_dereference(*walk); f;
- walk = &f->next, f = rtnl_dereference(*walk)) {
- if (&f->result == r)
- goto found;
- }
- }
- return -ENOENT;
-
-found:
- rcu_assign_pointer(*walk, rtnl_dereference(f->next));
- }
- tcf_unbind_filter(tp, &r->res);
- /* all classifiers are required to call tcf_exts_destroy() after rcu
- * grace period, since converted-to-rcu actions are relying on that
- * in cleanup() callback
- */
- if (f) {
- if (tcf_exts_get_net(&f->result.exts))
- tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
- else
- __tcindex_destroy_fexts(f);
- } else {
- if (tcf_exts_get_net(&r->exts))
- tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
- else
- __tcindex_destroy_rexts(r);
- }
-
- *last = false;
- return 0;
-}
-
-static int tcindex_destroy_element(struct tcf_proto *tp,
- void *arg, struct tcf_walker *walker)
-{
- bool last;
-
- return tcindex_delete(tp, arg, &last, NULL);
-}
-
-static void __tcindex_destroy(struct rcu_head *head)
-{
- struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
-
- kfree(p->perfect);
- kfree(p->h);
- kfree(p);
-}
-
-static inline int
-valid_perfect_hash(struct tcindex_data *p)
-{
- return p->hash > (p->mask >> p->shift);
-}
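
valid_perfect_hash() simply checks that the table can index every possible masked-and-shifted key directly. Combined with the sizing logic further down in tcindex_set_parms(), that gives, for example:

/* Worked example: mask = 0x003F, shift = 0
 *   mask >> shift = 63 < PERFECT_HASH_THRESHOLD (64)
 *   => hash = (mask >> shift) + 1 = 64 slots, indexed directly by key
 * With mask = 0xFFF0, shift = 4: mask >> shift = 0x0FFF (4095) >= 64, so
 * an imperfect hash of DEFAULT_HASH_SIZE (64) buckets, chained by key,
 * is used instead.
 */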
-
-static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
- [TCA_TCINDEX_HASH] = { .type = NLA_U32 },
- [TCA_TCINDEX_MASK] = { .type = NLA_U16 },
- [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 },
- [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 },
- [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
-};
-
-static int tcindex_filter_result_init(struct tcindex_filter_result *r)
-{
- memset(r, 0, sizeof(*r));
- return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
-}
-
-static void __tcindex_partial_destroy(struct rcu_head *head)
-{
- struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
-
- kfree(p->perfect);
- kfree(p);
-}
-
-static void tcindex_free_perfect_hash(struct tcindex_data *cp)
-{
- int i;
-
- for (i = 0; i < cp->hash; i++)
- tcf_exts_destroy(&cp->perfect[i].exts);
- kfree(cp->perfect);
-}
-
-static int tcindex_alloc_perfect_hash(struct tcindex_data *cp)
-{
- int i, err = 0;
-
- cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result),
- GFP_KERNEL);
- if (!cp->perfect)
- return -ENOMEM;
-
- for (i = 0; i < cp->hash; i++) {
- err = tcf_exts_init(&cp->perfect[i].exts,
- TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
- if (err < 0)
- goto errout;
- }
-
- return 0;
-
-errout:
- tcindex_free_perfect_hash(cp);
- return err;
-}
-
-static int
-tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
- u32 handle, struct tcindex_data *p,
- struct tcindex_filter_result *r, struct nlattr **tb,
- struct nlattr *est, bool ovr, struct netlink_ext_ack *extack)
-{
- struct tcindex_filter_result new_filter_result, *old_r = r;
- struct tcindex_filter_result cr;
- struct tcindex_data *cp = NULL, *oldp;
- struct tcindex_filter *f = NULL; /* make gcc behave */
- int err, balloc = 0;
- struct tcf_exts e;
-
- err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
- if (err < 0)
- return err;
- err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack);
- if (err < 0)
- goto errout;
-
- err = -ENOMEM;
- /* tcindex_data attributes must look atomic to classifier/lookup so
- * allocate new tcindex data and RCU assign it onto root. Keeping
- * perfect hash and hash pointers from old data.
- */
- cp = kzalloc(sizeof(*cp), GFP_KERNEL);
- if (!cp)
- goto errout;
-
- cp->mask = p->mask;
- cp->shift = p->shift;
- cp->hash = p->hash;
- cp->alloc_hash = p->alloc_hash;
- cp->fall_through = p->fall_through;
- cp->tp = tp;
-
- if (p->perfect) {
- int i;
-
- if (tcindex_alloc_perfect_hash(cp) < 0)
- goto errout;
- for (i = 0; i < cp->hash; i++)
- cp->perfect[i].res = p->perfect[i].res;
- balloc = 1;
- }
- cp->h = p->h;
-
- err = tcindex_filter_result_init(&new_filter_result);
- if (err < 0)
- goto errout1;
- err = tcindex_filter_result_init(&cr);
- if (err < 0)
- goto errout1;
- if (old_r)
- cr.res = r->res;
-
- if (tb[TCA_TCINDEX_HASH])
- cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
-
- if (tb[TCA_TCINDEX_MASK])
- cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
-
- if (tb[TCA_TCINDEX_SHIFT])
- cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
-
- err = -EBUSY;
-
- /* Hash already allocated, make sure that we still meet the
- * requirements for the allocated hash.
- */
- if (cp->perfect) {
- if (!valid_perfect_hash(cp) ||
- cp->hash > cp->alloc_hash)
- goto errout_alloc;
- } else if (cp->h && cp->hash != cp->alloc_hash) {
- goto errout_alloc;
- }
-
- err = -EINVAL;
- if (tb[TCA_TCINDEX_FALL_THROUGH])
- cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
-
- if (!cp->hash) {
- /* Hash not specified, use perfect hash if the upper limit
- * of the hashing index is below the threshold.
- */
- if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
- cp->hash = (cp->mask >> cp->shift) + 1;
- else
- cp->hash = DEFAULT_HASH_SIZE;
- }
-
- if (!cp->perfect && !cp->h)
- cp->alloc_hash = cp->hash;
-
- /* Note: this could be as restrictive as if (handle & ~(mask >> shift))
- * but then, we'd fail handles that may become valid after some future
- * mask change. While this is extremely unlikely to ever matter,
- * the check below is safer (and also more backwards-compatible).
- */
- if (cp->perfect || valid_perfect_hash(cp))
- if (handle >= cp->alloc_hash)
- goto errout_alloc;
-
-
- err = -ENOMEM;
- if (!cp->perfect && !cp->h) {
- if (valid_perfect_hash(cp)) {
- if (tcindex_alloc_perfect_hash(cp) < 0)
- goto errout_alloc;
- balloc = 1;
- } else {
- struct tcindex_filter __rcu **hash;
-
- hash = kcalloc(cp->hash,
- sizeof(struct tcindex_filter *),
- GFP_KERNEL);
-
- if (!hash)
- goto errout_alloc;
-
- cp->h = hash;
- balloc = 2;
- }
- }
-
- if (cp->perfect)
- r = cp->perfect + handle;
- else
- r = tcindex_lookup(cp, handle) ? : &new_filter_result;
-
- if (r == &new_filter_result) {
- f = kzalloc(sizeof(*f), GFP_KERNEL);
- if (!f)
- goto errout_alloc;
- f->key = handle;
- f->next = NULL;
- err = tcindex_filter_result_init(&f->result);
- if (err < 0) {
- kfree(f);
- goto errout_alloc;
- }
- }
-
- if (tb[TCA_TCINDEX_CLASSID]) {
- cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
- tcf_bind_filter(tp, &cr.res, base);
- }
-
- if (old_r && old_r != r) {
- err = tcindex_filter_result_init(old_r);
- if (err < 0) {
- kfree(f);
- goto errout_alloc;
- }
- }
-
- oldp = p;
- r->res = cr.res;
- tcf_exts_change(&r->exts, &e);
-
- rcu_assign_pointer(tp->root, cp);
-
- if (r == &new_filter_result) {
- struct tcindex_filter *nfp;
- struct tcindex_filter __rcu **fp;
-
- f->result.res = r->res;
- tcf_exts_change(&f->result.exts, &r->exts);
-
- fp = cp->h + (handle % cp->hash);
- for (nfp = rtnl_dereference(*fp);
- nfp;
- fp = &nfp->next, nfp = rtnl_dereference(*fp))
- ; /* nothing */
-
- rcu_assign_pointer(*fp, f);
- }
-
- if (oldp)
- call_rcu(&oldp->rcu, __tcindex_partial_destroy);
- return 0;
-
-errout_alloc:
- if (balloc == 1)
- tcindex_free_perfect_hash(cp);
- else if (balloc == 2)
- kfree(cp->h);
-errout1:
- tcf_exts_destroy(&cr.exts);
- tcf_exts_destroy(&new_filter_result.exts);
-errout:
- kfree(cp);
- tcf_exts_destroy(&e);
- return err;
-}
-
-static int
-tcindex_change(struct net *net, struct sk_buff *in_skb,
- struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, void **arg, bool ovr,
- struct netlink_ext_ack *extack)
-{
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_TCINDEX_MAX + 1];
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r = *arg;
- int err;
-
- pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
- "p %p,r %p,*arg %p\n",
- tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL);
-
- if (!opt)
- return 0;
-
- err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy, NULL);
- if (err < 0)
- return err;
-
- return tcindex_set_parms(net, tp, base, handle, p, r, tb,
- tca[TCA_RATE], ovr, extack);
-}
-
-static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter *f, *next;
- int i;
-
- pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
- if (p->perfect) {
- for (i = 0; i < p->hash; i++) {
- if (!p->perfect[i].res.class)
- continue;
- if (walker->count >= walker->skip) {
- if (walker->fn(tp, p->perfect + i, walker) < 0) {
- walker->stop = 1;
- return;
- }
- }
- walker->count++;
- }
- }
- if (!p->h)
- return;
- for (i = 0; i < p->hash; i++) {
- for (f = rtnl_dereference(p->h[i]); f; f = next) {
- next = rtnl_dereference(f->next);
- if (walker->count >= walker->skip) {
- if (walker->fn(tp, &f->result, walker) < 0) {
- walker->stop = 1;
- return;
- }
- }
- walker->count++;
- }
- }
-}
-
-static void tcindex_destroy(struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcf_walker walker;
-
- pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
- walker.count = 0;
- walker.skip = 0;
- walker.fn = tcindex_destroy_element;
- tcindex_walk(tp, &walker);
-
- call_rcu(&p->rcu, __tcindex_destroy);
-}
-
-
-static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
-{
- struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcindex_filter_result *r = fh;
- struct nlattr *nest;
-
- pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n",
- tp, fh, skb, t, p, r);
- pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
-
- nest = nla_nest_start(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
-
- if (!fh) {
- t->tcm_handle = ~0; /* whatever ... */
- if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
- nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
- nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
- nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
- goto nla_put_failure;
- nla_nest_end(skb, nest);
- } else {
- if (p->perfect) {
- t->tcm_handle = r - p->perfect;
- } else {
- struct tcindex_filter *f;
- struct tcindex_filter __rcu **fp;
- int i;
-
- t->tcm_handle = 0;
- for (i = 0; !t->tcm_handle && i < p->hash; i++) {
- fp = &p->h[i];
- for (f = rtnl_dereference(*fp);
- !t->tcm_handle && f;
- fp = &f->next, f = rtnl_dereference(*fp)) {
- if (&f->result == r)
- t->tcm_handle = f->key;
- }
- }
- }
- pr_debug("handle = %d\n", t->tcm_handle);
- if (r->res.class &&
- nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
- goto nla_put_failure;
-
- if (tcf_exts_dump(skb, &r->exts) < 0)
- goto nla_put_failure;
- nla_nest_end(skb, nest);
-
- if (tcf_exts_dump_stats(skb, &r->exts) < 0)
- goto nla_put_failure;
- }
-
- return skb->len;
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl)
-{
- struct tcindex_filter_result *r = fh;
-
- if (r && r->res.classid == classid)
- r->res.class = cl;
-}
-
-static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
- .kind = "tcindex",
- .classify = tcindex_classify,
- .init = tcindex_init,
- .destroy = tcindex_destroy,
- .get = tcindex_get,
- .change = tcindex_change,
- .delete = tcindex_delete,
- .walk = tcindex_walk,
- .dump = tcindex_dump,
- .bind_class = tcindex_bind_class,
- .owner = THIS_MODULE,
-};
-
-static int __init init_tcindex(void)
-{
- return register_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-static void __exit exit_tcindex(void)
-{
- unregister_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-module_init(init_tcindex)
-module_exit(exit_tcindex)
-MODULE_LICENSE("GPL");
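The tcindex_set_parms() logic removed above follows the pattern its own comment describes: build a complete replacement tcindex_data, publish it with rcu_assign_pointer(), and release the old copy only after a grace period via call_rcu(). A minimal userspace sketch of that copy-then-publish idea (plain assignment and free() stand in for rcu_assign_pointer() and call_rcu(); the struct and its fields are illustrative, not kernel types):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for struct tcindex_data: just a few fields. */
struct cfg {
	unsigned int hash;
	unsigned short mask;
	unsigned int shift;
};

static struct cfg *active;	/* readers only ever see a complete cfg */

/* Copy-update: never modify the live cfg in place. */
static int cfg_update(unsigned int new_hash)
{
	struct cfg *old = active;
	struct cfg *new = malloc(sizeof(*new));

	if (!new)
		return -1;
	/* start from a full copy of the current configuration ... */
	if (old)
		memcpy(new, old, sizeof(*new));
	else
		memset(new, 0, sizeof(*new));
	/* ... apply the requested change on the private copy ... */
	new->hash = new_hash;
	/* ... publish it in one step (rcu_assign_pointer() in the kernel) ... */
	active = new;
	/* ... and free the old copy once no reader can hold it (call_rcu()). */
	free(old);
	return 0;
}

int main(void)
{
	cfg_update(64);
	cfg_update(256);
	printf("hash = %u\n", active->hash);
	free(active);
	return 0;
}

The point of the pattern is that a concurrent lookup sees either the old configuration or the new one in full, never a half-updated mix.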
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index dcea21004604..2a1c00048fd6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* The filters are packed to hash tables of key nodes
@@ -24,9 +20,6 @@
* pure RSVP doesn't need such a general approach and can use
* much simpler (and faster) schemes, sort of cls_rsvp.c.
*
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
- *
* nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
*/
@@ -46,15 +39,14 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
#include <linux/idr.h>
+#include <net/tc_wrapper.h>
struct tc_u_knode {
struct tc_u_knode __rcu *next;
u32 handle;
struct tc_u_hnode __rcu *ht_up;
struct tcf_exts exts;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif
u8 fshift;
struct tcf_result res;
struct tc_u_hnode __rcu *ht_down;
@@ -79,7 +71,7 @@ struct tc_u_hnode {
struct tc_u_hnode __rcu *next;
u32 handle;
u32 prio;
- int refcnt;
+ refcount_t refcnt;
unsigned int divisor;
struct idr handle_idr;
bool is_root;
@@ -88,18 +80,28 @@ struct tc_u_hnode {
/* The 'ht' field MUST be the last field in structure to allow for
* more entries allocated at end of structure.
*/
- struct tc_u_knode __rcu *ht[1];
+ struct tc_u_knode __rcu *ht[];
};
struct tc_u_common {
struct tc_u_hnode __rcu *hlist;
void *ptr;
- int refcnt;
+ refcount_t refcnt;
struct idr handle_idr;
struct hlist_node hnode;
long knodes;
};
+static u32 handle2id(u32 h)
+{
+ return ((h & 0x80000000) ? ((h >> 20) & 0x7FF) : h);
+}
+
+static u32 id2handle(u32 id)
+{
+ return (id | 0x800U) << 20;
+}
+
static inline unsigned int u32_hash_fold(__be32 key,
const struct tc_u32_sel *sel,
u8 fshift)
@@ -109,8 +111,9 @@ static inline unsigned int u32_hash_fold(__be32 key,
return h;
}
-static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res)
+TC_INDIRECT_SCOPE int u32_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ struct tcf_result *res)
{
struct {
struct tc_u_knode *knode;
@@ -180,12 +183,10 @@ check_terminal:
if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, n->ifindex)) {
n = rcu_dereference_bh(n->next);
goto next_knode;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
__this_cpu_inc(n->pf->rhit);
#endif
@@ -319,7 +320,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr)
int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL);
if (id < 0)
return 0;
- return (id | 0x800U) << 20;
+ return id2handle(id);
}
static struct hlist_head *tc_u_common_hash;
@@ -364,12 +365,12 @@ static int u32_init(struct tcf_proto *tp)
void *key = tc_u_common_ptr(tp);
struct tc_u_common *tp_c = tc_u_common_find(key);
- root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
+ root_ht = kzalloc(struct_size(root_ht, ht, 1), GFP_KERNEL);
if (root_ht == NULL)
return -ENOBUFS;
- root_ht->refcnt++;
- root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
+ refcount_set(&root_ht->refcnt, 1);
+ root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : id2handle(0);
root_ht->prio = tp->prio;
root_ht->is_root = true;
idr_init(&root_ht->handle_idr);
@@ -380,31 +381,38 @@ static int u32_init(struct tcf_proto *tp)
kfree(root_ht);
return -ENOBUFS;
}
+ refcount_set(&tp_c->refcnt, 1);
tp_c->ptr = key;
INIT_HLIST_NODE(&tp_c->hnode);
idr_init(&tp_c->handle_idr);
hlist_add_head(&tp_c->hnode, tc_u_hash(key));
+ } else {
+ refcount_inc(&tp_c->refcnt);
}
- tp_c->refcnt++;
RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
rcu_assign_pointer(tp_c->hlist, root_ht);
- root_ht->refcnt++;
+ /* root_ht must be destroyed when tcf_proto is destroyed */
rcu_assign_pointer(tp->root, root_ht);
tp->data = tp_c;
return 0;
}
-static int u32_destroy_key(struct tc_u_knode *n, bool free_pf)
+static void __u32_destroy_key(struct tc_u_knode *n)
{
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
tcf_exts_destroy(&n->exts);
- tcf_exts_put_net(&n->exts);
- if (ht && --ht->refcnt == 0)
+ if (ht && refcount_dec_and_test(&ht->refcnt))
kfree(ht);
+ kfree(n);
+}
+
+static void u32_destroy_key(struct tc_u_knode *n, bool free_pf)
+{
+ tcf_exts_put_net(&n->exts);
#ifdef CONFIG_CLS_U32_PERF
if (free_pf)
free_percpu(n->pf);
@@ -413,8 +421,7 @@ static int u32_destroy_key(struct tc_u_knode *n, bool free_pf)
if (free_pf)
free_percpu(n->pcpu_success);
#endif
- kfree(n);
- return 0;
+ __u32_destroy_key(n);
}
/* u32_delete_key_rcu should be called when free'ing a copied
@@ -491,7 +498,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true);
}
static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
@@ -509,7 +516,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true);
if (err < 0) {
u32_clear_hw_hnode(tp, h, NULL);
return err;
@@ -533,8 +540,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.command = TC_CLSU32_DELETE_KNODE;
cls_u32.knode.handle = n->handle;
- tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
- tcf_block_offload_dec(block, &n->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false,
+ &n->flags, &n->in_hw_count, true);
}
static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
@@ -563,13 +570,11 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
if (n->ht_down)
cls_u32.knode.link_handle = ht->handle;
- err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
- if (err < 0) {
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw,
+ &n->flags, &n->in_hw_count, true);
+ if (err) {
u32_remove_hw_knode(tp, n, NULL);
return err;
- } else if (err > 0) {
- n->in_hw_count = err;
- tcf_block_offload_inc(block, &n->flags);
}
if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
@@ -608,8 +613,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
struct tc_u_hnode __rcu **hn;
struct tc_u_hnode *phn;
- WARN_ON(--ht->refcnt);
-
u32_clear_hnode(tp, ht, extack);
hn = &tp_c->hlist;
@@ -619,7 +622,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
if (phn == ht) {
u32_clear_hw_hnode(tp, ht, extack);
idr_destroy(&ht->handle_idr);
- idr_remove(&tp_c->handle_idr, ht->handle);
+ idr_remove(&tp_c->handle_idr, handle2id(ht->handle));
RCU_INIT_POINTER(*hn, ht->next);
kfree_rcu(ht, rcu);
return 0;
@@ -629,17 +632,18 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
return -ENOENT;
}
-static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void u32_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
WARN_ON(root_ht == NULL);
- if (root_ht && --root_ht->refcnt == 1)
+ if (root_ht && refcount_dec_and_test(&root_ht->refcnt))
u32_destroy_hnode(tp, root_ht, extack);
- if (--tp_c->refcnt == 0) {
+ if (refcount_dec_and_test(&tp_c->refcnt)) {
struct tc_u_hnode *ht;
hlist_del(&tp_c->hnode);
@@ -651,7 +655,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
/* u32_destroy_key() will later free ht for us, if it's
* still referenced by some knode
*/
- if (--ht->refcnt == 0)
+ if (refcount_dec_and_test(&ht->refcnt))
kfree_rcu(ht, rcu);
}
@@ -663,7 +667,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
}
static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct tc_u_hnode *ht = arg;
struct tc_u_common *tp_c = tp->data;
@@ -680,7 +684,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
return -EINVAL;
}
- if (ht->refcnt == 1) {
+ if (refcount_dec_if_one(&ht->refcnt)) {
u32_destroy_hnode(tp, ht, extack);
} else {
NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
@@ -688,7 +692,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
}
out:
- *last = tp_c->refcnt == 1 && tp_c->knodes == 0;
+ *last = refcount_read(&tp_c->refcnt) == 1 && tp_c->knodes == 0;
return ret;
}
@@ -718,18 +722,40 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
[TCA_U32_FLAGS] = { .type = NLA_U32 },
};
+static void u32_unbind_filter(struct tcf_proto *tp, struct tc_u_knode *n,
+ struct nlattr **tb)
+{
+ if (tb[TCA_U32_CLASSID])
+ tcf_unbind_filter(tp, &n->res);
+}
+
+static void u32_bind_filter(struct tcf_proto *tp, struct tc_u_knode *n,
+ unsigned long base, struct nlattr **tb)
+{
+ if (tb[TCA_U32_CLASSID]) {
+ n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
+ tcf_bind_filter(tp, &n->res, base);
+ }
+}
+
static int u32_set_parms(struct net *net, struct tcf_proto *tp,
- unsigned long base,
struct tc_u_knode *n, struct nlattr **tb,
- struct nlattr *est, bool ovr,
+ struct nlattr *est, u32 flags, u32 fl_flags,
struct netlink_ext_ack *extack)
{
- int err;
+ int err, ifindex = -1;
- err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, extack);
+ err = tcf_exts_validate_ex(net, tp, tb, est, &n->exts, flags,
+ fl_flags, extack);
if (err < 0)
return err;
+ if (tb[TCA_U32_INDEV]) {
+ ifindex = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
+ if (ifindex < 0)
+ return -EINVAL;
+ }
+
if (tb[TCA_U32_LINK]) {
u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
struct tc_u_hnode *ht_down = NULL, *ht_old;
@@ -750,29 +776,19 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
NL_SET_ERR_MSG_MOD(extack, "Not linking to root node");
return -EINVAL;
}
- ht_down->refcnt++;
+ refcount_inc(&ht_down->refcnt);
}
ht_old = rtnl_dereference(n->ht_down);
rcu_assign_pointer(n->ht_down, ht_down);
if (ht_old)
- ht_old->refcnt--;
- }
- if (tb[TCA_U32_CLASSID]) {
- n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
- tcf_bind_filter(tp, &n->res, base);
+ refcount_dec(&ht_old->refcnt);
}
-#ifdef CONFIG_NET_CLS_IND
- if (tb[TCA_U32_INDEV]) {
- int ret;
- ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
- if (ret < 0)
- return -EINVAL;
- n->ifindex = ret;
- }
-#endif
+ if (ifindex >= 0)
+ n->ifindex = ifindex;
+
return 0;
}
@@ -803,16 +819,14 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
rcu_assign_pointer(*ins, n);
}
-static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
+static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
struct tc_u_knode *n)
{
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
struct tc_u32_sel *s = &n->sel;
struct tc_u_knode *new;
- new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
- GFP_KERNEL);
-
+ new = kzalloc(struct_size(new, sel.keys, s->nkeys), GFP_KERNEL);
if (!new)
return NULL;
@@ -820,18 +834,11 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
new->handle = n->handle;
RCU_INIT_POINTER(new->ht_up, n->ht_up);
-#ifdef CONFIG_NET_CLS_IND
new->ifindex = n->ifindex;
-#endif
new->fshift = n->fshift;
- new->res = n->res;
new->flags = n->flags;
RCU_INIT_POINTER(new->ht_down, ht);
- /* bump reference count as long as we hold pointer to structure */
- if (ht)
- ht->refcnt++;
-
#ifdef CONFIG_CLS_U32_PERF
/* Statistics may be incremented by readers during update
* so we must keep them intact. When the node is later destroyed
@@ -846,19 +853,23 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
/* Similarly success statistics must be moved as pointers */
new->pcpu_success = n->pcpu_success;
#endif
- memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
+ memcpy(&new->sel, s, struct_size(s, keys, s->nkeys));
- if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
+ if (tcf_exts_init(&new->exts, net, TCA_U32_ACT, TCA_U32_POLICE)) {
kfree(new);
return NULL;
}
+ /* bump reference count as long as we hold pointer to structure */
+ if (ht)
+ refcount_inc(&ht->refcnt);
+
return new;
}
static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, void **arg, bool ovr,
+ struct nlattr **tca, void **arg, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_u_common *tp_c = tp->data;
@@ -867,12 +878,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tc_u32_sel *s;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_U32_MAX + 1];
- u32 htid, flags = 0;
+ u32 htid, userflags = 0;
size_t sel_size;
int err;
-#ifdef CONFIG_CLS_U32_PERF
- size_t size;
-#endif
if (!opt) {
if (handle) {
@@ -883,13 +891,14 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
}
}
- err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_U32_MAX, opt, u32_policy,
+ extack);
if (err < 0)
return err;
if (tb[TCA_U32_FLAGS]) {
- flags = nla_get_u32(tb[TCA_U32_FLAGS]);
- if (!tc_flags_valid(flags)) {
+ userflags = nla_get_u32(tb[TCA_U32_FLAGS]);
+ if (!tc_flags_valid(userflags)) {
NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags");
return -EINVAL;
}
@@ -904,33 +913,46 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return -EINVAL;
}
- if ((n->flags ^ flags) &
+ if ((n->flags ^ userflags) &
~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) {
NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags");
return -EINVAL;
}
- new = u32_init_knode(tp, n);
+ new = u32_init_knode(net, tp, n);
if (!new)
return -ENOMEM;
- err = u32_set_parms(net, tp, base, new, tb,
- tca[TCA_RATE], ovr, extack);
+ err = u32_set_parms(net, tp, new, tb, tca[TCA_RATE],
+ flags, new->flags, extack);
if (err) {
- u32_destroy_key(new, false);
+ __u32_destroy_key(new);
return err;
}
+ u32_bind_filter(tp, new, base, tb);
+
err = u32_replace_hw_knode(tp, new, flags, extack);
if (err) {
- u32_destroy_key(new, false);
+ u32_unbind_filter(tp, new, tb);
+
+ if (tb[TCA_U32_LINK]) {
+ struct tc_u_hnode *ht_old;
+
+ ht_old = rtnl_dereference(n->ht_down);
+ if (ht_old)
+ refcount_inc(&ht_old->refcnt);
+ }
+ __u32_destroy_key(new);
return err;
}
if (!tc_in_hw(new->flags))
new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ tcf_proto_update_usesw(tp, new->flags);
+
u32_replace_knode(tp, tp_c, new);
tcf_unbind_filter(tp, &n->res);
tcf_exts_get_net(&n->exts);
@@ -953,7 +975,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table");
return -EINVAL;
}
- ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
+ ht = kzalloc(struct_size(ht, ht, divisor + 1), GFP_KERNEL);
if (ht == NULL)
return -ENOBUFS;
if (handle == 0) {
@@ -970,16 +992,16 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return err;
}
}
- ht->refcnt = 1;
+ refcount_set(&ht->refcnt, 1);
ht->divisor = divisor;
ht->handle = handle;
ht->prio = tp->prio;
idr_init(&ht->handle_idr);
- ht->flags = flags;
+ ht->flags = userflags;
- err = u32_replace_hw_hnode(tp, ht, flags, extack);
+ err = u32_replace_hw_hnode(tp, ht, userflags, extack);
if (err) {
- idr_remove(&tp_c->handle_idr, handle);
+ idr_remove(&tp_c->handle_idr, handle2id(handle));
kfree(ht);
return err;
}
@@ -1013,18 +1035,62 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return -EINVAL;
}
+ /* At this point, we need to derive the new handle that will be used to
+ * uniquely map the identity of this table match entry. The
+ * identity of the entry that we need to construct is 32 bits made of:
+ * htid(12b):bucketid(8b):node/entryid(12b)
+ *
+ * At this point _we have the table(ht)_ in which we will insert this
+ * entry. We carry the table's id in variable "htid".
+ * Note that earlier code picked the ht selection either by a) the user
+ * providing the htid specified via TCA_U32_HASH attribute or b) when
+ * no such attribute is passed then the root ht is used, which has ID
+ * 0x[800][00][000]. Rule: the root table has a single bucket with ID 0.
+ * If OTOH the user passed us the htid, they may also pass a bucketid of
+ * choice. 0 is fine. For example, a user htid of 0x[600][01][000]
+ * indicates hash bucketid 1. Rule: the entry/node ID _cannot_ be
+ * passed via the htid, so even if it was non-zero it will be ignored.
+ *
+ * We may also have a handle, if the user passed one. The handle also
+ * carries the same addressing of htid(12b):bucketid(8b):node/entryid(12b).
+ * Rule: the bucketid on the handle is ignored even if one was passed;
+ * rather the value on "htid" is always assumed to be the bucketid.
+ */
if (handle) {
+ /* Rule: The htid from handle and tableid from htid must match */
if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) {
NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch");
return -EINVAL;
}
- handle = htid | TC_U32_NODE(handle);
- err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle,
- GFP_KERNEL);
- if (err)
- return err;
- } else
+ /* Ok, so far we have a valid htid(12b):bucketid(8b) but we
+ * need to finalize the table entry identification with the last
+ * part - the node/entryid(12b). Rule: Nodeid _cannot be 0_ for
+ * entries. Rule: nodeid of 0 is reserved only for tables (see
+ * earlier code which processes TC_U32_DIVISOR attribute).
+ * Rule: The nodeid can only be derived from the handle (and not
+ * htid).
+ * Rule: if the handle specifies zero for the node id (for example
+ * 0x60000000), then pick a new nodeid from the pool of IDs
+ * this hash table has been allocating from.
+ * If OTOH it is specified (for example the user passed a
+ * handle such as 0x60000123), then we use it to generate our final
+ * handle which is used to uniquely identify the match entry.
+ */
+ if (!TC_U32_NODE(handle)) {
+ handle = gen_new_kid(ht, htid);
+ } else {
+ handle = htid | TC_U32_NODE(handle);
+ err = idr_alloc_u32(&ht->handle_idr, NULL, &handle,
+ handle, GFP_KERNEL);
+ if (err)
+ return err;
+ }
+ } else {
+ /* The user did not give us a handle; let's just generate one
+ * from the table's pool of nodeids.
+ */
handle = gen_new_kid(ht, htid);
+ }
if (tb[TCA_U32_SEL] == NULL) {
NL_SET_ERR_MSG_MOD(extack, "Selector not specified");
@@ -1039,28 +1105,32 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
goto erridr;
}
- n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
+ n = kzalloc(struct_size(n, sel.keys, s->nkeys), GFP_KERNEL);
if (n == NULL) {
err = -ENOBUFS;
goto erridr;
}
#ifdef CONFIG_CLS_U32_PERF
- size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
- n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
+ n->pf = __alloc_percpu(struct_size(n->pf, kcnts, s->nkeys),
+ __alignof__(struct tc_u32_pcnt));
if (!n->pf) {
err = -ENOBUFS;
goto errfree;
}
#endif
- memcpy(&n->sel, s, sel_size);
+ unsafe_memcpy(&n->sel, s, sel_size,
+ /* A composite flex-array structure destination,
+ * which was correctly sized with struct_size(),
+ * bounds-checked against nla_len(), and allocated
+ * above. */);
RCU_INIT_POINTER(n->ht_up, ht);
n->handle = handle;
n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
- n->flags = flags;
+ n->flags = userflags;
- err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
+ err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE);
if (err < 0)
goto errout;
@@ -1080,19 +1150,24 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
}
#endif
- err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr,
- extack);
+ err = u32_set_parms(net, tp, n, tb, tca[TCA_RATE],
+ flags, n->flags, extack);
+
+ u32_bind_filter(tp, n, base, tb);
+
if (err == 0) {
struct tc_u_knode __rcu **ins;
struct tc_u_knode *pins;
err = u32_replace_hw_knode(tp, n, flags, extack);
if (err)
- goto errhw;
+ goto errunbind;
if (!tc_in_hw(n->flags))
n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ tcf_proto_update_usesw(tp, n->flags);
+
ins = &ht->ht[TC_U32_HASH(handle)];
for (pins = rtnl_dereference(*ins); pins;
ins = &pins->next, pins = rtnl_dereference(*ins))
@@ -1106,7 +1181,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return 0;
}
-errhw:
+errunbind:
+ u32_unbind_filter(tp, n, tb);
+
#ifdef CONFIG_CLS_U32_MARK
free_percpu(n->pcpu_success);
#endif
@@ -1123,7 +1200,8 @@ erridr:
return err;
}
-static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
@@ -1138,33 +1216,23 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
ht = rtnl_dereference(ht->next)) {
if (ht->prio != tp->prio)
continue;
- if (arg->count >= arg->skip) {
- if (arg->fn(tp, ht, arg) < 0) {
- arg->stop = 1;
- return;
- }
- }
- arg->count++;
+
+ if (!tc_cls_stats_dump(tp, arg, ht))
+ return;
+
for (h = 0; h <= ht->divisor; h++) {
for (n = rtnl_dereference(ht->ht[h]);
n;
n = rtnl_dereference(n->next)) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(tp, n, arg) < 0) {
- arg->stop = 1;
+ if (!tc_cls_stats_dump(tp, arg, n))
return;
- }
- arg->count++;
}
}
}
}
static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
- bool add, tc_setup_cb_t *cb, void *cb_priv,
+ bool add, flow_setup_cb_t *cb, void *cb_priv,
struct netlink_ext_ack *extack)
{
struct tc_cls_u32_offload cls_u32 = {};
@@ -1184,13 +1252,12 @@ static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
}
static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
- bool add, tc_setup_cb_t *cb, void *cb_priv,
+ bool add, flow_setup_cb_t *cb, void *cb_priv,
struct netlink_ext_ack *extack)
{
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
struct tcf_block *block = tp->chain->block;
struct tc_cls_u32_offload cls_u32 = {};
- int err;
tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
cls_u32.command = add ?
@@ -1213,19 +1280,12 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.knode.link_handle = ht->handle;
}
- err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
- if (err) {
- if (add && tc_skip_sw(n->flags))
- return err;
- return 0;
- }
-
- tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
-
- return 0;
+ return tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32,
+ &cls_u32, cb_priv, &n->flags,
+ &n->in_hw_count);
}
-static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
struct tc_u_common *tp_c = tp->data;
@@ -1272,16 +1332,16 @@ static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
return 0;
}
-static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
+static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct tc_u_knode *n = fh;
- if (n && n->res.classid == classid)
- n->res.class = cl;
+ tc_cls_bind_class(classid, cl, q, &n->res, base);
}
static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct tc_u_knode *n = fh;
struct tc_u_hnode *ht_up, *ht_down;
@@ -1292,7 +1352,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
t->tcm_handle = n->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -1308,8 +1368,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
int cpu;
#endif
- if (nla_put(skb, TCA_U32_SEL,
- sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
+ if (nla_put(skb, TCA_U32_SEL, struct_size(&n->sel, keys, n->sel.nkeys),
&n->sel))
goto nla_put_failure;
@@ -1352,18 +1411,14 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (tcf_exts_dump(skb, &n->exts) < 0)
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (n->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, n->ifindex);
if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
goto nla_put_failure;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
- gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
- n->sel.nkeys * sizeof(u64),
- GFP_KERNEL);
+ gpf = kzalloc(struct_size(gpf, kcnts, n->sel.nkeys), GFP_KERNEL);
if (!gpf)
goto nla_put_failure;
@@ -1377,9 +1432,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
gpf->kcnts[i] += pf->kcnts[i];
}
- if (nla_put_64bit(skb, TCA_U32_PCNT,
- sizeof(struct tc_u32_pcnt) +
- n->sel.nkeys * sizeof(u64),
+ if (nla_put_64bit(skb, TCA_U32_PCNT, struct_size(gpf, kcnts, n->sel.nkeys),
gpf, TCA_U32_PAD)) {
kfree(gpf);
goto nla_put_failure;
@@ -1414,6 +1467,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = {
.bind_class = u32_bind_class,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_CLS("u32");
static int __init init_u32(void)
{
@@ -1423,9 +1477,7 @@ static int __init init_u32(void)
#ifdef CONFIG_CLS_U32_PERF
pr_info(" Performance counters on\n");
#endif
-#ifdef CONFIG_NET_CLS_IND
pr_info(" input device check on\n");
-#endif
#ifdef CONFIG_NET_CLS_ACT
pr_info(" Actions configured\n");
#endif
@@ -1452,4 +1504,5 @@ static void __exit exit_u32(void)
module_init(init_u32)
module_exit(exit_u32)
+MODULE_DESCRIPTION("Universal 32bit based TC Classifier");
MODULE_LICENSE("GPL");
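The comment added to u32_change() above, together with the new id2handle()/handle2id() helpers, describes the 32-bit u32 handle as htid(12b):bucketid(8b):node/entryid(12b), with hash-table ids allocated from an IDR and mapped into the top bits. A small standalone sketch of that layout (userspace C; the helpers below are local re-implementations for illustration, not the kernel's TC_U32_* macros):

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* u32 handle layout: htid(12 bits) : bucket(8 bits) : node(12 bits) */
static uint32_t u32_htid(uint32_t h) { return h & 0xFFF00000u; }
static uint32_t u32_hash(uint32_t h) { return (h >> 12) & 0xFFu; }
static uint32_t u32_node(uint32_t h) { return h & 0xFFFu; }

/* IDR id <-> hash-table handle, mirroring id2handle()/handle2id() */
static uint32_t id2handle(uint32_t id) { return (id | 0x800u) << 20; }
static uint32_t handle2id(uint32_t h)
{
	return (h & 0x80000000u) ? ((h >> 20) & 0x7FFu) : h;
}

int main(void)
{
	uint32_t root = id2handle(0);	/* 0x80000000: the root table */
	uint32_t ht = id2handle(1);	/* 0x80100000: first user table */
	uint32_t knode = ht | (1u << 12) | 0x123u; /* bucket 1, node 0x123 */

	assert(handle2id(root) == 0);
	assert(handle2id(ht) == 1);
	assert(u32_htid(knode) == ht);
	assert(u32_hash(knode) == 1);
	assert(u32_node(knode) == 0x123);
	printf("knode handle = 0x%08" PRIx32 "\n", knode);
	return 0;
}

Built with a plain C compiler this should print knode handle = 0x80101123, i.e. table 0x801, bucket 1, node 0x123.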
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
index ddd883ca55b2..2d27f91d8441 100644
--- a/net/sched/em_canid.c
+++ b/net/sched/em_canid.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* em_canid.c Ematch rule to match CAN frames according to their CAN IDs
*
- * This program is free software; you can distribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
* Copyright: (c) 2011 Czech Technical University in Prague
* (c) 2011 Volkswagen Group Research
@@ -44,6 +40,7 @@ struct canid_match {
/**
* em_canid_get_id() - Extracts Can ID out of the sk_buff structure.
+ * @skb: buffer to extract Can ID from
*/
static canid_t em_canid_get_id(struct sk_buff *skb)
{
@@ -102,6 +99,9 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
int i;
const struct can_filter *lp;
+ if (!pskb_may_pull(skb, CAN_MTU))
+ return 0;
+
can_id = em_canid_get_id(skb);
if (can_id & CAN_EFF_FLAG) {
@@ -225,6 +225,7 @@ static void __exit exit_em_canid(void)
tcf_em_unregister(&em_canid_ops);
}
+MODULE_DESCRIPTION("ematch classifier to match CAN IDs embedded in skb CAN frames");
MODULE_LICENSE("GPL");
module_init(init_em_canid);
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index 1c8360a2752a..48c1bce74f49 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_cmp.c Simple packet data comparison ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -14,7 +10,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/tc_ematch/tc_em_cmp.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/pkt_cls.h>
static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
@@ -26,9 +22,12 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
- unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
+ unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer);
u32 val = 0;
+ if (!ptr)
+ return 0;
+ ptr += cmp->off;
if (!tcf_valid_offset(skb, ptr, cmp->align))
return 0;
@@ -45,7 +44,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
break;
case TCF_EM_ALIGN_U32:
- /* Worth checking boundries? The branching seems
+ /* Worth checking boundaries? The branching seems
* to get worse. Visit again.
*/
val = get_unaligned_be32(ptr);
@@ -91,6 +90,7 @@ static void __exit exit_em_cmp(void)
tcf_em_unregister(&em_cmp_ops);
}
+MODULE_DESCRIPTION("ematch classifier for basic data types(8/16/32 bit) against skb data");
MODULE_LICENSE("GPL");
module_init(init_em_cmp);
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index c1b23e3060b8..c95cf86fb431 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/em_ipset.c ipset ematch
*
* Copyright (c) 2012 Florian Westphal <fw@strlen.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/gfp.h>
@@ -62,7 +59,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
};
int ret, network_offset;
- switch (tc_skb_protocol(skb)) {
+ switch (skb_protocol(skb, true)) {
case htons(ETH_P_IP):
state.pf = NFPROTO_IPV4;
if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
index a5f34e930eff..3650117da47f 100644
--- a/net/sched/em_ipt.c
+++ b/net/sched/em_ipt.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_ipt.c IPtables matches Ematch
*
* (c) 2018 Eyal Birger <eyal.birger@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/gfp.h>
@@ -25,7 +21,8 @@
struct em_ipt_match {
const struct xt_match *match;
u32 hook;
- u8 match_data[0] __aligned(8);
+ u8 nfproto;
+ u8 match_data[] __aligned(8);
};
struct em_ipt_xt_match {
@@ -75,11 +72,25 @@ static int policy_validate_match_data(struct nlattr **tb, u8 mrev)
return 0;
}
+static int addrtype_validate_match_data(struct nlattr **tb, u8 mrev)
+{
+ if (mrev != 1) {
+ pr_err("only addrtype match revision 1 supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static const struct em_ipt_xt_match em_ipt_xt_matches[] = {
{
.match_name = "policy",
.validate_match_data = policy_validate_match_data
},
+ {
+ .match_name = "addrtype",
+ .validate_match_data = addrtype_validate_match_data
+ },
{}
};
@@ -119,9 +130,10 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
struct em_ipt_match *im = NULL;
struct xt_match *match;
int mdata_len, ret;
+ u8 nfproto;
- ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy,
- NULL);
+ ret = nla_parse_deprecated(tb, TCA_EM_IPT_MAX, data, data_len,
+ em_ipt_policy, NULL);
if (ret < 0)
return ret;
@@ -129,6 +141,15 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
!tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO])
return -EINVAL;
+ nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]);
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ break;
+ default:
+ return -EINVAL;
+ }
+
match = get_xt_match(tb);
if (IS_ERR(match)) {
pr_err("unable to load match\n");
@@ -144,6 +165,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
im->match = match;
im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]);
+ im->nfproto = nfproto;
nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len);
ret = check_match(net, im, mdata_len);
@@ -177,7 +199,7 @@ static void em_ipt_destroy(struct tcf_ematch *em)
im->match->destroy(&par);
}
module_put(im->match->me);
- kfree((void *)im);
+ kfree(im);
}
static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
@@ -186,15 +208,33 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
const struct em_ipt_match *im = (const void *)em->data;
struct xt_action_param acpar = {};
struct net_device *indev = NULL;
+ u8 nfproto = im->match->family;
struct nf_hook_state state;
int ret;
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV6;
+ break;
+ default:
+ return 0;
+ }
+
rcu_read_lock();
if (skb->skb_iif)
indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
- nf_hook_state_init(&state, im->hook, im->match->family,
+ nf_hook_state_init(&state, im->hook, nfproto,
indev ?: skb->dev, skb->dev, NULL, em->net, NULL);
acpar.match = im->match;
@@ -217,7 +257,7 @@ static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em)
return -EMSGSIZE;
if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0)
return -EMSGSIZE;
- if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0)
+ if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->nfproto) < 0)
return -EMSGSIZE;
if (nla_put(skb, TCA_EM_IPT_MATCH_DATA,
im->match->usersize ?: im->match->matchsize,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index d6e97115500b..3f2e707a11d1 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_meta.c Metadata ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
@@ -48,7 +44,7 @@
* be provided for non-numeric types.
*
* Additionally, type dependent modifiers such as shift operators
- * or mask may be applied to extend the functionaliy. As of now,
+ * or mask may be applied to extend the functionality. As of now,
* the variable length type supports shifting the byte string to
* the right, eating up any number of octets and thus supporting
* wildcard interface name comparisons such as "ppp%" matching
@@ -199,7 +195,7 @@ META_COLLECTOR(int_priority)
META_COLLECTOR(int_protocol)
{
/* Let userspace take care of the byte ordering */
- dst->value = tc_skb_protocol(skb);
+ dst->value = skb_protocol(skb, false);
}
META_COLLECTOR(int_pkttype)
@@ -315,12 +311,15 @@ META_COLLECTOR(int_sk_bound_if)
META_COLLECTOR(var_sk_bound_if)
{
+ int bound_dev_if;
+
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
- if (skb->sk->sk_bound_dev_if == 0) {
+ bound_dev_if = READ_ONCE(skb->sk->sk_bound_dev_if);
+ if (bound_dev_if == 0) {
dst->value = (unsigned long) "any";
dst->len = 3;
} else {
@@ -328,7 +327,7 @@ META_COLLECTOR(var_sk_bound_if)
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(skb->sk),
- skb->sk->sk_bound_dev_if);
+ bound_dev_if);
*err = var_dev(dev, dst);
rcu_read_unlock();
}
@@ -450,7 +449,7 @@ META_COLLECTOR(int_sk_wmem_queued)
*err = -1;
return;
}
- dst->value = sk->sk_wmem_queued;
+ dst->value = READ_ONCE(sk->sk_wmem_queued);
}
META_COLLECTOR(int_sk_fwd_alloc)
@@ -461,7 +460,7 @@ META_COLLECTOR(int_sk_fwd_alloc)
*err = -1;
return;
}
- dst->value = sk->sk_forward_alloc;
+ dst->value = READ_ONCE(sk->sk_forward_alloc);
}
META_COLLECTOR(int_sk_sndbuf)
@@ -503,7 +502,7 @@ META_COLLECTOR(int_sk_lingertime)
*err = -1;
return;
}
- dst->value = sk->sk_lingertime / HZ;
+ dst->value = READ_ONCE(sk->sk_lingertime) / HZ;
}
META_COLLECTOR(int_sk_err_qlen)
@@ -525,7 +524,7 @@ META_COLLECTOR(int_sk_ack_bl)
*err = -1;
return;
}
- dst->value = sk->sk_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_ack_backlog);
}
META_COLLECTOR(int_sk_max_ack_bl)
@@ -536,7 +535,7 @@ META_COLLECTOR(int_sk_max_ack_bl)
*err = -1;
return;
}
- dst->value = sk->sk_max_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_max_ack_backlog);
}
META_COLLECTOR(int_sk_prio)
@@ -547,7 +546,7 @@ META_COLLECTOR(int_sk_prio)
*err = -1;
return;
}
- dst->value = sk->sk_priority;
+ dst->value = READ_ONCE(sk->sk_priority);
}
META_COLLECTOR(int_sk_rcvlowat)
@@ -558,7 +557,7 @@ META_COLLECTOR(int_sk_rcvlowat)
*err = -1;
return;
}
- dst->value = sk->sk_rcvlowat;
+ dst->value = READ_ONCE(sk->sk_rcvlowat);
}
META_COLLECTOR(int_sk_rcvtimeo)
@@ -569,7 +568,7 @@ META_COLLECTOR(int_sk_rcvtimeo)
*err = -1;
return;
}
- dst->value = sk->sk_rcvtimeo / HZ;
+ dst->value = READ_ONCE(sk->sk_rcvtimeo) / HZ;
}
META_COLLECTOR(int_sk_sndtimeo)
@@ -580,7 +579,7 @@ META_COLLECTOR(int_sk_sndtimeo)
*err = -1;
return;
}
- dst->value = sk->sk_sndtimeo / HZ;
+ dst->value = READ_ONCE(sk->sk_sndtimeo) / HZ;
}
META_COLLECTOR(int_sk_sendmsg_off)
@@ -912,7 +911,8 @@ static int em_meta_change(struct net *net, void *data, int len,
struct tcf_meta_hdr *hdr;
struct meta_match *meta = NULL;
- err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy, NULL);
+ err = nla_parse_deprecated(tb, TCA_EM_META_MAX, data, len,
+ meta_policy, NULL);
if (err < 0)
goto errout;
@@ -1006,6 +1006,7 @@ static void __exit exit_em_meta(void)
tcf_em_unregister(&em_meta_ops);
}
+MODULE_DESCRIPTION("ematch classifier for various internal kernel metadata, skb metadata and sk metadata");
MODULE_LICENSE("GPL");
module_init(init_em_meta);
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 07c10bac06a0..c65ffa5fff94 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_nbyte.c N-Byte ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -20,7 +16,7 @@
struct nbyte_data {
struct tcf_em_nbyte hdr;
- char pattern[0];
+ char pattern[];
};
static int em_nbyte_change(struct net *net, void *data, int data_len,
@@ -35,7 +31,7 @@ static int em_nbyte_change(struct net *net, void *data, int data_len,
em->datalen = sizeof(*nbyte) + nbyte->len;
em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
if (em->data == 0UL)
- return -ENOBUFS;
+ return -ENOMEM;
return 0;
}
@@ -46,6 +42,8 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
+ if (!ptr)
+ return 0;
ptr += nbyte->hdr.off;
if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
@@ -72,6 +70,7 @@ static void __exit exit_em_nbyte(void)
tcf_em_unregister(&em_nbyte_ops);
}
+MODULE_DESCRIPTION("ematch classifier for arbitrary skb multi-bytes");
MODULE_LICENSE("GPL");
module_init(init_em_nbyte);
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index 73e2ed576ceb..692e2be1793e 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_text.c Textsearch ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -33,12 +29,19 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
struct tcf_pkt_info *info)
{
struct text_match *tm = EM_TEXT_PRIV(m);
+ unsigned char *ptr;
int from, to;
- from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+ ptr = tcf_get_base_ptr(skb, tm->from_layer);
+ if (!ptr)
+ return 0;
+ from = ptr - skb->data;
from += tm->from_offset;
- to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+ ptr = tcf_get_base_ptr(skb, tm->to_layer);
+ if (!ptr)
+ return 0;
+ to = ptr - skb->data;
to += tm->to_offset;
return skb_find_text(skb, from, to, tm->config) != UINT_MAX;
@@ -101,8 +104,10 @@ retry:
static void em_text_destroy(struct tcf_ematch *m)
{
- if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config)
+ if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) {
textsearch_destroy(EM_TEXT_PRIV(m)->config);
+ kfree(EM_TEXT_PRIV(m));
+ }
}
static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
@@ -110,7 +115,7 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
struct text_match *tm = EM_TEXT_PRIV(m);
struct tcf_em_text conf;
- strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+ strscpy(conf.algo, tm->config->ops->name);
conf.from_offset = tm->from_offset;
conf.to_offset = tm->to_offset;
conf.from_layer = tm->from_layer;
@@ -149,6 +154,7 @@ static void __exit exit_em_text(void)
tcf_em_unregister(&em_text_ops);
}
+MODULE_DESCRIPTION("ematch classifier for embedded text in skbs");
MODULE_LICENSE("GPL");
module_init(init_em_text);
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 797bdb88c010..fdec4db5ec89 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_u32.c U32 Ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -56,6 +52,7 @@ static void __exit exit_em_u32(void)
tcf_em_unregister(&em_u32_ops);
}
+MODULE_DESCRIPTION("ematch skb classifier using 32 bit chunks of data");
MODULE_LICENSE("GPL");
module_init(init_em_u32);
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 1331a4c2d8ff..5c1235e6076a 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/ematch.c Extended Match API
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
@@ -145,7 +141,7 @@ errout:
EXPORT_SYMBOL(tcf_em_register);
/**
- * tcf_em_unregister - unregster and extended match
+ * tcf_em_unregister - unregister an extended match
*
* @ops: ematch operations lookup table
*
@@ -242,6 +238,9 @@ static int tcf_em_validate(struct tcf_proto *tp,
goto errout;
if (em->ops->change) {
+ err = -EINVAL;
+ if (em_hdr->flags & TCF_EM_SIMPLE)
+ goto errout;
err = em->ops->change(net, data, data_len, em);
if (err < 0)
goto errout;
@@ -256,6 +255,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
* the value carried.
*/
if (em_hdr->flags & TCF_EM_SIMPLE) {
+ if (em->ops->datalen > 0)
+ goto errout;
if (data_len < sizeof(u32))
goto errout;
em->data = *(u32 *) data;
@@ -267,12 +268,12 @@ static int tcf_em_validate(struct tcf_proto *tp,
}
em->data = (unsigned long) v;
}
+ em->datalen = data_len;
}
}
em->matchid = em_hdr->matchid;
em->flags = em_hdr->flags;
- em->datalen = data_len;
em->net = net;
err = 0;
@@ -314,7 +315,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
if (!nla)
return 0;
- err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_EMATCH_TREE_MAX, nla,
+ em_policy, NULL);
if (err < 0)
goto errout;
@@ -389,7 +391,6 @@ EXPORT_SYMBOL(tcf_em_tree_validate);
/**
* tcf_em_tree_destroy - destroy an ematch tree
*
- * @tp: classifier kind handle
* @tree: ematch tree to be deleted
*
* This function destroys an ematch tree previously created by
@@ -425,7 +426,7 @@ EXPORT_SYMBOL(tcf_em_tree_destroy);
* tcf_em_tree_dump - dump ematch tree into a rtnl message
*
* @skb: skb holding the rtnl message
- * @t: ematch tree to be dumped
+ * @tree: ematch tree to be dumped
* @tlv: TLV type to be used to encapsulate the tree
*
* This function dumps an ematch tree into an rtnl message. It is valid to
@@ -440,14 +441,14 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
struct nlattr *top_start;
struct nlattr *list_start;
- top_start = nla_nest_start(skb, tlv);
+ top_start = nla_nest_start_noflag(skb, tlv);
if (top_start == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
goto nla_put_failure;
- list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
+ list_start = nla_nest_start_noflag(skb, TCA_EMATCH_TREE_LIST);
if (list_start == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 03e26e8d0ec9..f56b18c8aebf 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_api.c Packet scheduler API.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Fixes:
@@ -29,12 +25,17 @@
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
+#include <linux/bpf.h>
+#include <net/netdev_lock.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include <net/tc_wrapper.h>
+
+#include <trace/events/qdisc.h>
/*
@@ -173,7 +174,7 @@ out_einval:
}
EXPORT_SYMBOL(register_qdisc);
-int unregister_qdisc(struct Qdisc_ops *qops)
+void unregister_qdisc(struct Qdisc_ops *qops)
{
struct Qdisc_ops *q, **qp;
int err = -ENOENT;
@@ -188,7 +189,8 @@ int unregister_qdisc(struct Qdisc_ops *qops)
err = 0;
}
write_unlock(&qdisc_mod_lock);
- return err;
+
+ WARN(err, "unregister qdisc(%s) failed\n", qops->id);
}
EXPORT_SYMBOL(unregister_qdisc);
@@ -196,7 +198,7 @@ EXPORT_SYMBOL(unregister_qdisc);
void qdisc_get_default(char *name, size_t len)
{
read_lock(&qdisc_mod_lock);
- strlcpy(name, default_qdisc_ops->id, len);
+ strscpy(name, default_qdisc_ops->id, len);
read_unlock(&qdisc_mod_lock);
}
@@ -206,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name)
for (q = qdisc_base; q; q = q->next) {
if (!strcmp(name, q->id)) {
- if (!try_module_get(q->owner))
+ if (!bpf_try_module_get(q, q->owner))
q = NULL;
break;
}
@@ -228,7 +230,7 @@ int qdisc_set_default(const char *name)
if (!ops) {
/* Not found, drop lock and try to load module */
write_unlock(&qdisc_mod_lock);
- request_module("sch_%s", name);
+ request_module(NET_SCH_ALIAS_PREFIX "%s", name);
write_lock(&qdisc_mod_lock);
ops = qdisc_lookup_default(name);
@@ -236,7 +238,7 @@ int qdisc_set_default(const char *name)
if (ops) {
/* Set new default */
- module_put(default_qdisc_ops->owner);
+ bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner);
default_qdisc_ops = ops;
}
write_unlock(&qdisc_mod_lock);
@@ -269,7 +271,8 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
root->handle == handle)
return root;
- hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
+ hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
+ lockdep_rtnl_is_held()) {
if (q->handle == handle)
return q;
}
@@ -302,13 +305,13 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
if (!handle)
return NULL;
- q = qdisc_match_from_root(dev->qdisc, handle);
+ q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
if (q)
goto out;
if (dev_ingress_queue(dev))
q = qdisc_match_from_root(
- dev_ingress_queue(dev)->qdisc_sleeping,
+ rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
handle);
out:
return q;
@@ -321,28 +324,34 @@ struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
if (!handle)
return NULL;
- q = qdisc_match_from_root(dev->qdisc, handle);
+ q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
if (q)
goto out;
nq = dev_ingress_queue_rcu(dev);
if (nq)
- q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
+ q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
+ handle);
out:
return q;
}
-static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
+static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid,
+ struct netlink_ext_ack *extack)
{
unsigned long cl;
const struct Qdisc_class_ops *cops = p->ops->cl_ops;
- if (cops == NULL)
- return NULL;
+ if (cops == NULL) {
+ NL_SET_ERR_MSG(extack, "Parent qdisc is not classful");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
cl = cops->find(p, classid);
- if (cl == 0)
- return NULL;
+ if (cl == 0) {
+ NL_SET_ERR_MSG(extack, "Specified class not found");
+ return ERR_PTR(-ENOENT);
+ }
return cops->leaf(p, cl);
}
@@ -356,7 +365,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
read_lock(&qdisc_mod_lock);
for (q = qdisc_base; q; q = q->next) {
if (nla_strcmp(kind, q->id) == 0) {
- if (!try_module_get(q->owner))
+ if (!bpf_try_module_get(q, q->owner))
q = NULL;
break;
}
@@ -413,7 +422,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
{
struct qdisc_rate_table *rtab;
- if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
+ if (tab == NULL || r->rate == 0 ||
+ r->cell_log == 0 || r->cell_log >= 32 ||
nla_len(tab) != TC_RTAB_SIZE) {
NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
return NULL;
@@ -421,7 +431,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
- !memcmp(&rtab->data, nla_data(tab), 1024)) {
+ !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) {
rtab->refcnt++;
return rtab;
}
@@ -431,7 +441,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
if (rtab) {
rtab->rate = *r;
rtab->refcnt = 1;
- memcpy(rtab->data, nla_data(tab), 1024);
+ memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
if (r->linklayer == TC_LINKLAYER_UNAWARE)
r->linklayer = __detect_linklayer(r, rtab->data);
rtab->next = qdisc_rtab_list;
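
A note on the new "r->cell_log >= 32" check and the TC_RTAB_SIZE cleanups above: cell_log is later used as a shift count when indexing the 256-entry u32 rate table, so values of 32 or more would shift a 32-bit quantity past its width and could index data[] out of bounds. A simplified, hedged illustration (demo_rtab_lookup() is illustrative, not the exact in-tree lookup):

	#include <linux/minmax.h>
	#include <net/pkt_sched.h>

	/* Simplified lookup: packet length -> transmission-time slot. */
	static u32 demo_rtab_lookup(const struct qdisc_rate_table *rtab,
				    unsigned int pktlen)
	{
		unsigned int slot = pktlen >> rtab->rate.cell_log; /* needs cell_log < 32 */

		return rtab->data[min(slot, 255U)];
	}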
@@ -479,7 +489,8 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
u16 *tab = NULL;
int err;
- err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
+ extack);
if (err < 0)
return ERR_PTR(err);
if (!tb[TCA_STAB_BASE]) {
@@ -506,31 +517,33 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
list_for_each_entry(stab, &qdisc_stab_list, list) {
if (memcmp(&stab->szopts, s, sizeof(*s)))
continue;
- if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
+ if (tsize > 0 &&
+ memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
continue;
stab->refcnt++;
return stab;
}
- stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
+ if (s->size_log > STAB_SIZE_LOG_MAX ||
+ s->cell_log > STAB_SIZE_LOG_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
+ return ERR_PTR(-EINVAL);
+ }
+
+ stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
if (!stab)
return ERR_PTR(-ENOMEM);
stab->refcnt = 1;
stab->szopts = *s;
if (tsize > 0)
- memcpy(stab->data, tab, tsize * sizeof(u16));
+ memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
list_add_tail(&stab->list, &qdisc_stab_list);
return stab;
}
-static void stab_kfree_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct qdisc_size_table, rcu));
-}
-
void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
@@ -538,7 +551,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
if (--tab->refcnt == 0) {
list_del(&tab->list);
- call_rcu(&tab->rcu, stab_kfree_rcu);
+ kfree_rcu(tab, rcu);
}
}
EXPORT_SYMBOL(qdisc_put_stab);
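
A note on the size-table hunks above: struct_size() and flex_array_size() (from <linux/overflow.h>) replace the open-coded "sizeof(*stab) + tsize * sizeof(u16)" arithmetic and saturate instead of wrapping on overflow, while the two-argument kfree_rcu(tab, rcu) replaces the hand-rolled call_rcu() + container_of() callback. A minimal, hedged sketch of the allocation pattern with a stand-in struct:

	#include <linux/overflow.h>
	#include <linux/slab.h>

	struct demo_stab {		/* stand-in for struct qdisc_size_table */
		unsigned int refcnt;
		u16 data[];		/* flexible array member */
	};

	static struct demo_stab *demo_stab_alloc(size_t tsize)
	{
		struct demo_stab *stab;

		/* struct_size(): header plus tsize elements, overflow-checked. */
		stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
		return stab;
	}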
@@ -547,7 +560,7 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_STAB);
+ nest = nla_nest_start_noflag(skb, TCA_STAB);
if (nest == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
@@ -587,17 +600,6 @@ out:
pkt_len = 1;
qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
-EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
-
-void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
-{
- if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
- pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
- txt, qdisc->ops->id, qdisc->handle >> 16);
- qdisc->flags |= TCQ_F_WARN_NONWC;
- }
-}
-EXPORT_SYMBOL(qdisc_warn_nonwc);
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
@@ -614,8 +616,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
clockid_t clockid)
{
- hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
- wd->timer.function = qdisc_watchdog;
+ hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED);
wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
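
A note on the watchdog timer init above: hrtimer_setup() folds hrtimer_init() and the callback assignment into one call, so a timer can no longer be armed with its function pointer unset. Hedged sketch of the pattern with hypothetical demo_* names:

	#include <linux/hrtimer.h>

	static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
	{
		return HRTIMER_NORESTART;
	}

	static void demo_timer_init(struct hrtimer *t)
	{
		hrtimer_setup(t, demo_timer_fn, CLOCK_MONOTONIC,
			      HRTIMER_MODE_ABS_PINNED);
	}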
@@ -626,21 +627,35 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
}
EXPORT_SYMBOL(qdisc_watchdog_init);
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
+void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
+ u64 delta_ns)
{
- if (test_bit(__QDISC_STATE_DEACTIVATED,
- &qdisc_root_sleeping(wd->qdisc)->state))
- return;
+ bool deactivated;
- if (wd->last_expires == expires)
+ rcu_read_lock();
+ deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
+ &qdisc_root_sleeping(wd->qdisc)->state);
+ rcu_read_unlock();
+ if (deactivated)
return;
- wd->last_expires = expires;
- hrtimer_start(&wd->timer,
- ns_to_ktime(expires),
- HRTIMER_MODE_ABS_PINNED);
+ if (hrtimer_is_queued(&wd->timer)) {
+ u64 softexpires;
+
+ softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
+ /* If timer is already set in [expires, expires + delta_ns],
+ * do not reprogram it.
+ */
+ if (softexpires - expires <= delta_ns)
+ return;
+ }
+
+ hrtimer_start_range_ns(&wd->timer,
+ ns_to_ktime(expires),
+ delta_ns,
+ HRTIMER_MODE_ABS_PINNED);
}
-EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
+EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
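
A note on qdisc_watchdog_schedule_range_ns(): the extra delta_ns argument gives the timer a tolerance window so nearby expirations can be coalesced instead of reprogramming the hrtimer, and (to the best of my reading) the old qdisc_watchdog_schedule_ns() remains as an inline wrapper passing a zero delta. Hedged usage sketch with an arbitrary 100us slack:

	#include <net/pkt_sched.h>

	static void demo_arm_watchdog(struct qdisc_watchdog *wd, u64 next_ns)
	{
		/* allow the timer to fire anywhere in [next_ns, next_ns + 100us] */
		qdisc_watchdog_schedule_range_ns(wd, next_ns, 100 * NSEC_PER_USEC);
	}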
@@ -760,42 +775,33 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
- bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
const struct Qdisc_class_ops *cops;
unsigned long cl;
u32 parentid;
bool notify;
int drops;
- if (n == 0 && len == 0)
- return;
drops = max_t(int, n, 0);
rcu_read_lock();
while ((parentid = sch->parent)) {
- if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
+ if (parentid == TC_H_ROOT)
break;
if (sch->flags & TCQ_F_NOPARENT)
break;
- /* Notify parent qdisc only if child qdisc becomes empty.
- *
- * If child was empty even before update then backlog
- * counter is screwed and we skip notification because
- * parent class is already passive.
- *
- * If the original child was offloaded then it is allowed
- * to be seem as empty, so the parent is notified anyway.
- */
- notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
- !qdisc_is_offloaded);
+ /* Notify parent qdisc only if child qdisc becomes empty. */
+ notify = !sch->q.qlen;
/* TODO: perform the search on a per txq basis */
- sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
+ sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
if (sch == NULL) {
WARN_ON_ONCE(parentid != TC_H_ROOT);
break;
}
cops = sch->ops->cl_ops;
if (notify && cops->qlen_notify) {
+ /* Note that qlen_notify must be idempotent as it may get called
+ * multiple times.
+ */
cl = cops->find(sch, parentid);
cops->qlen_notify(sch, cl);
}
@@ -857,6 +863,23 @@ void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);
+void qdisc_offload_query_caps(struct net_device *dev,
+ enum tc_setup_type type,
+ void *caps, size_t caps_len)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_query_caps_base base = {
+ .type = type,
+ .caps = caps,
+ };
+
+ memset(caps, 0, caps_len);
+
+ if (ops->ndo_setup_tc)
+ ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
+}
+EXPORT_SYMBOL(qdisc_offload_query_caps);
+
static void qdisc_offload_graft_root(struct net_device *dev,
struct Qdisc *new, struct Qdisc *old,
struct netlink_ext_ack *extack)
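
A note on qdisc_offload_query_caps(): it zeroes the caller's capability struct and lets the driver fill it through the TC_QUERY_CAPS setup type, so a qdisc can probe hardware support before committing to an offload. Hedged example modelled on how sch_taprio uses it; struct tc_taprio_caps and its gate_mask_per_txq field are assumptions about that header, not shown in this diff:

	#include <net/pkt_sched.h>

	static bool demo_gate_mask_per_txq(struct net_device *dev)
	{
		struct tc_taprio_caps caps;

		qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
					 &caps, sizeof(caps));
		return caps.gate_mask_per_txq;
	}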
@@ -873,9 +896,10 @@ static void qdisc_offload_graft_root(struct net_device *dev,
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
- u32 portid, u32 seq, u16 flags, int event)
+ u32 portid, u32 seq, u16 flags, int event,
+ struct netlink_ext_ack *extack)
{
- struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+ struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
struct gnet_stats_queue __percpu *cpu_qstats = NULL;
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -933,8 +957,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
cpu_qstats = q->cpu_qstats;
}
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
- &d, cpu_bstats, &q->bstats) < 0 ||
+ if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
@@ -942,7 +965,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_finish_copy(&d) < 0)
goto nla_put_failure;
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -961,25 +989,55 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
return false;
}
+static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, u32 clid, struct Qdisc *q,
+ struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (!tc_qdisc_dump_ignore(q, false)) {
+ if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
+ RTM_NEWQDISC, extack) < 0)
+ goto err_out;
+ }
+
+ if (skb->len)
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+
+err_out:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, u32 clid,
- struct Qdisc *old, struct Qdisc *new)
+ struct Qdisc *old, struct Qdisc *new,
+ struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return 0;
+
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
if (old && !tc_qdisc_dump_ignore(old, false)) {
if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
- 0, RTM_DELQDISC) < 0)
+ 0, RTM_DELQDISC, extack) < 0)
goto err_out;
}
if (new && !tc_qdisc_dump_ignore(new, false)) {
if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
- old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
goto err_out;
}
@@ -994,15 +1052,29 @@ err_out:
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
struct nlmsghdr *n, u32 clid,
- struct Qdisc *old, struct Qdisc *new)
+ struct Qdisc *old, struct Qdisc *new,
+ struct netlink_ext_ack *extack)
{
if (new || old)
- qdisc_notify(net, skb, n, clid, old, new);
+ qdisc_notify(net, skb, n, clid, old, new, extack);
if (old)
qdisc_put(old);
}
+static void qdisc_clear_nolock(struct Qdisc *sch)
+{
+ sch->flags &= ~TCQ_F_NOLOCK;
+ if (!(sch->flags & TCQ_F_CPUSTATS))
+ return;
+
+ free_percpu(sch->cpu_bstats);
+ free_percpu(sch->cpu_qstats);
+ sch->cpu_bstats = NULL;
+ sch->cpu_qstats = NULL;
+ sch->flags &= ~TCQ_F_CPUSTATS;
+}
+
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
* to device "dev".
*
@@ -1022,17 +1094,29 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (parent == NULL) {
unsigned int i, num_q, ingress;
+ struct netdev_queue *dev_queue;
ingress = 0;
num_q = dev->num_tx_queues;
if ((q && q->flags & TCQ_F_INGRESS) ||
(new && new->flags & TCQ_F_INGRESS)) {
- num_q = 1;
ingress = 1;
- if (!dev_ingress_queue(dev)) {
+ dev_queue = dev_ingress_queue(dev);
+ if (!dev_queue) {
NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
return -ENOENT;
}
+
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
+
+ /* This is the counterpart of that qdisc_refcount_inc_nz() call in
+ * __tcf_qdisc_find() for filter requests.
+ */
+ if (!qdisc_refcount_dec_if_one(q)) {
+ NL_SET_ERR_MSG(extack,
+ "Current ingress or clsact Qdisc has ongoing filter requests");
+ return -EBUSY;
+ }
}
if (dev->flags & IFF_UP)
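
A note on the qdisc_refcount_dec_if_one() call above: it is the usual "destroy only if we hold the last reference" pattern, returning -EBUSY instead of racing against in-flight filter requests that took their own reference via qdisc_refcount_inc_nz(). Generic hedged sketch of the same idea with a plain refcount_t:

	#include <linux/errno.h>
	#include <linux/refcount.h>

	static int demo_exclusive_teardown(refcount_t *ref)
	{
		if (!refcount_dec_if_one(ref))
			return -EBUSY;	/* other references still exist */

		/* sole owner here: safe to tear the object down */
		return 0;
	}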
@@ -1040,35 +1124,42 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_offload_graft_root(dev, new, old, extack);
- if (new && new->ops->attach)
+ if (new && new->ops->attach && !ingress)
goto skip;
- for (i = 0; i < num_q; i++) {
- struct netdev_queue *dev_queue = dev_ingress_queue(dev);
-
- if (!ingress)
+ if (!ingress) {
+ for (i = 0; i < num_q; i++) {
dev_queue = netdev_get_tx_queue(dev, i);
+ old = dev_graft_qdisc(dev_queue, new);
- old = dev_graft_qdisc(dev_queue, new);
- if (new && i > 0)
- qdisc_refcount_inc(new);
-
- if (!ingress)
+ if (new && i > 0)
+ qdisc_refcount_inc(new);
qdisc_put(old);
+ }
+ } else {
+ old = dev_graft_qdisc(dev_queue, NULL);
+
+ /* {ingress,clsact}_destroy() @old before grafting @new to avoid
+ * unprotected concurrent accesses to net_device::miniq_{in,e}gress
+ * pointer(s) in mini_qdisc_pair_swap().
+ */
+ qdisc_notify(net, skb, n, classid, old, new, extack);
+ qdisc_destroy(old);
+
+ dev_graft_qdisc(dev_queue, new);
}
skip:
if (!ingress) {
- notify_and_destroy(net, skb, n, classid,
- dev->qdisc, new);
+ old = rtnl_dereference(dev->qdisc);
if (new && !new->ops->attach)
qdisc_refcount_inc(new);
- dev->qdisc = new ? : &noop_qdisc;
+ rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
+
+ notify_and_destroy(net, skb, n, classid, old, new, extack);
if (new && new->ops->attach)
new->ops->attach(new);
- } else {
- notify_and_destroy(net, skb, n, classid, old, new);
}
if (dev->flags & IFF_UP)
@@ -1079,9 +1170,8 @@ skip:
int err;
/* Only support running class lockless if parent is lockless */
- if (new && (new->flags & TCQ_F_NOLOCK) &&
- parent && !(parent->flags & TCQ_F_NOLOCK))
- new->flags &= ~TCQ_F_NOLOCK;
+ if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
+ qdisc_clear_nolock(new);
if (!cops || !cops->graft)
return -EOPNOTSUPP;
@@ -1092,10 +1182,21 @@ skip:
return -ENOENT;
}
+ if (new && new->ops == &noqueue_qdisc_ops) {
+ NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
+ return -EINVAL;
+ }
+
+ if (new &&
+ !(parent->flags & TCQ_F_MQROOT) &&
+ rcu_access_pointer(new->stab)) {
+ NL_SET_ERR_MSG(extack, "STAB not supported on a non root");
+ return -EINVAL;
+ }
err = cops->graft(parent, cl, new, &old, extack);
if (err)
return err;
- notify_and_destroy(net, skb, n, classid, old, new);
+ notify_and_destroy(net, skb, n, classid, old, new, extack);
}
return 0;
}
@@ -1142,7 +1243,7 @@ static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
static struct Qdisc *qdisc_create(struct net_device *dev,
struct netdev_queue *dev_queue,
- struct Qdisc *p, u32 parent, u32 handle,
+ u32 parent, u32 handle,
struct nlattr **tca, int *errp,
struct netlink_ext_ack *extack)
{
@@ -1153,37 +1254,9 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
struct qdisc_size_table *stab;
ops = qdisc_lookup_ops(kind);
-#ifdef CONFIG_MODULES
- if (ops == NULL && kind != NULL) {
- char name[IFNAMSIZ];
- if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
- /* We dropped the RTNL semaphore in order to
- * perform the module load. So, even if we
- * succeeded in loading the module we have to
- * tell the caller to replay the request. We
- * indicate this using -EAGAIN.
- * We replay the request because the device may
- * go away in the mean time.
- */
- rtnl_unlock();
- request_module("sch_%s", name);
- rtnl_lock();
- ops = qdisc_lookup_ops(kind);
- if (ops != NULL) {
- /* We will try again qdisc_lookup_ops,
- * so don't keep a reference.
- */
- module_put(ops->owner);
- err = -EAGAIN;
- goto err_out;
- }
- }
- }
-#endif
-
- err = -ENOENT;
if (!ops) {
- NL_SET_ERR_MSG(extack, "Specified qdisc not found");
+ err = -ENOENT;
+ NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
goto err_out;
}
@@ -1196,14 +1269,21 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
sch->parent = parent;
if (handle == TC_H_INGRESS) {
- sch->flags |= TCQ_F_INGRESS;
+ if (!(sch->flags & TCQ_F_INGRESS)) {
+ NL_SET_ERR_MSG(extack,
+ "Specified parent ID is reserved for ingress and clsact Qdiscs");
+ err = -EINVAL;
+ goto err_out3;
+ }
handle = TC_H_MAKE(TC_H_INGRESS, 0);
} else {
if (handle == 0) {
handle = qdisc_alloc_handle(dev);
- err = -ENOMEM;
- if (handle == 0)
+ if (handle == 0) {
+ NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
+ err = -ENOSPC;
goto err_out3;
+ }
}
if (!netif_is_multiqueue(dev))
sch->flags |= TCQ_F_ONETXQUEUE;
@@ -1218,7 +1298,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
* before again attaching a qdisc.
*/
if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
- dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+ WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN);
netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
}
@@ -1226,41 +1306,33 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (err)
goto err_out3;
- if (ops->init) {
- err = ops->init(sch, tca[TCA_OPTIONS], extack);
- if (err != 0)
- goto err_out5;
- }
-
if (tca[TCA_STAB]) {
stab = qdisc_get_stab(tca[TCA_STAB], extack);
if (IS_ERR(stab)) {
err = PTR_ERR(stab);
- goto err_out4;
+ goto err_out3;
}
rcu_assign_pointer(sch->stab, stab);
}
- if (tca[TCA_RATE]) {
- seqcount_t *running;
+ if (ops->init) {
+ err = ops->init(sch, tca[TCA_OPTIONS], extack);
+ if (err != 0)
+ goto err_out4;
+ }
+
+ if (tca[TCA_RATE]) {
err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT) {
NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
goto err_out4;
}
- if (sch->parent != TC_H_ROOT &&
- !(sch->flags & TCQ_F_INGRESS) &&
- (!p || !(p->flags & TCQ_F_MQROOT)))
- running = qdisc_root_sleeping_running(sch);
- else
- running = &sch->running;
-
err = gen_new_estimator(&sch->bstats,
sch->cpu_bstats,
&sch->rate_est,
NULL,
- running,
+ true,
tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
@@ -1269,31 +1341,26 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
}
qdisc_hash_add(sch, false);
+ trace_qdisc_create(ops, dev, parent);
return sch;
-err_out5:
- /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
+err_out4:
+ /* Even if ops->init() failed, we call ops->destroy()
+ * like qdisc_create_dflt().
+ */
if (ops->destroy)
ops->destroy(sch);
+ qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
- dev_put(dev);
+ lockdep_unregister_key(&sch->root_lock_key);
+ netdev_put(dev, &sch->dev_tracker);
qdisc_free(sch);
err_out2:
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
err_out:
*errp = err;
return NULL;
-
-err_out4:
- /*
- * Any broken qdiscs that would require a ops->reset() here?
- * The qdisc was never in action so it shouldn't be necessary.
- */
- qdisc_put_stab(rtnl_dereference(sch->stab));
- if (ops->destroy)
- ops->destroy(sch);
- goto err_out3;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
@@ -1335,7 +1402,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
sch->cpu_bstats,
&sch->rate_est,
NULL,
- qdisc_root_sleeping_running(sch),
+ true,
tca[TCA_RATE]);
}
out:
@@ -1397,31 +1464,18 @@ const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
* Delete/get qdisc.
*/
-static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
- struct netlink_ext_ack *extack)
+static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack,
+ struct net_device *dev,
+ struct nlattr *tca[TCA_MAX + 1],
+ struct tcmsg *tcm)
{
struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm = nlmsg_data(n);
- struct nlattr *tca[TCA_MAX + 1];
- struct net_device *dev;
- u32 clid;
struct Qdisc *q = NULL;
struct Qdisc *p = NULL;
+ u32 clid;
int err;
- if ((n->nlmsg_type != RTM_GETQDISC) &&
- !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- extack);
- if (err < 0)
- return err;
-
- dev = __dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
clid = tcm->tcm_parent;
if (clid) {
if (clid != TC_H_ROOT) {
@@ -1431,17 +1485,19 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
return -ENOENT;
}
- q = qdisc_leaf(p, clid);
+ q = qdisc_leaf(p, clid, extack);
} else if (dev_ingress_queue(dev)) {
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
}
} else {
- q = dev->qdisc;
+ q = rtnl_dereference(dev->qdisc);
}
if (!q) {
NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
return -ENOENT;
}
+ if (IS_ERR(q))
+ return PTR_ERR(q);
if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
NL_SET_ERR_MSG(extack, "Invalid handle");
@@ -1456,7 +1512,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
}
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
- NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
return -EINVAL;
}
@@ -1473,44 +1529,67 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
if (err != 0)
return err;
} else {
- qdisc_notify(net, skb, n, clid, NULL, q);
+ qdisc_get_notify(net, skb, n, clid, q, NULL);
}
return 0;
}
-/*
- * Create/change qdisc.
- */
-
-static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
- struct netlink_ext_ack *extack)
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm;
+ struct tcmsg *tcm = nlmsg_data(n);
struct nlattr *tca[TCA_MAX + 1];
struct net_device *dev;
- u32 clid;
- struct Qdisc *q, *p;
int err;
- if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
-replay:
- /* Reinit, just in case something touches this. */
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
- tcm = nlmsg_data(n);
- clid = tcm->tcm_parent;
- q = p = NULL;
-
dev = __dev_get_by_index(net, tcm->tcm_ifindex);
if (!dev)
return -ENODEV;
+ netdev_lock_ops(dev);
+ err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm);
+ netdev_unlock_ops(dev);
+
+ return err;
+}
+
+static bool req_create_or_replace(struct nlmsghdr *n)
+{
+ return (n->nlmsg_flags & NLM_F_CREATE &&
+ n->nlmsg_flags & NLM_F_REPLACE);
+}
+
+static bool req_create_exclusive(struct nlmsghdr *n)
+{
+ return (n->nlmsg_flags & NLM_F_CREATE &&
+ n->nlmsg_flags & NLM_F_EXCL);
+}
+
+static bool req_change(struct nlmsghdr *n)
+{
+ return (!(n->nlmsg_flags & NLM_F_CREATE) &&
+ !(n->nlmsg_flags & NLM_F_REPLACE) &&
+ !(n->nlmsg_flags & NLM_F_EXCL));
+}
+
+static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack,
+ struct net_device *dev,
+ struct nlattr *tca[TCA_MAX + 1],
+ struct tcmsg *tcm)
+{
+ struct Qdisc *q = NULL;
+ struct Qdisc *p = NULL;
+ u32 clid;
+ int err;
+
+ clid = tcm->tcm_parent;
if (clid) {
if (clid != TC_H_ROOT) {
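
A note on the netdev_lock_ops()/netdev_unlock_ops() calls introduced in this and the following hunks: per-device qdisc work is now additionally bracketed by the device's instance ops lock, on top of RTNL. Hedged sketch of the calling pattern only; the lock helpers are assumed from the netdev instance-locking infrastructure, not defined in this diff:

	#include <linux/netdevice.h>

	static int demo_with_dev_ops_lock(struct net_device *dev)
	{
		int err;

		netdev_lock_ops(dev);
		err = 0;	/* ... act on dev's qdiscs while its ops are stable ... */
		netdev_unlock_ops(dev);

		return err;
	}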
@@ -1520,12 +1599,19 @@ replay:
NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
return -ENOENT;
}
- q = qdisc_leaf(p, clid);
+ if (p->flags & TCQ_F_INGRESS) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot add children to ingress/clsact qdisc");
+ return -EOPNOTSUPP;
+ }
+ q = qdisc_leaf(p, clid, extack);
+ if (IS_ERR(q))
+ return PTR_ERR(q);
} else if (dev_ingress_queue_create(dev)) {
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
}
} else {
- q = dev->qdisc;
+ q = rtnl_dereference(dev->qdisc);
}
/* It may be default qdisc, ignore it */
@@ -1545,13 +1631,22 @@ replay:
q = qdisc_lookup(dev, tcm->tcm_handle);
if (!q)
goto create_n_graft;
+ if (q->parent != tcm->tcm_parent) {
+ NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent");
+ return -EINVAL;
+ }
if (n->nlmsg_flags & NLM_F_EXCL) {
NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
return -EEXIST;
}
if (tca[TCA_KIND] &&
nla_strcmp(tca[TCA_KIND], q->ops->id)) {
- NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
+ return -EINVAL;
+ }
+ if (q->flags & TCQ_F_INGRESS) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot regraft ingress or clsact Qdiscs");
return -EINVAL;
}
if (q == p ||
@@ -1559,6 +1654,10 @@ replay:
NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
return -ELOOP;
}
+ if (clid == TC_H_INGRESS) {
+ NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
+ return -EINVAL;
+ }
qdisc_refcount_inc(q);
goto graft;
} else {
@@ -1569,27 +1668,35 @@ replay:
*
* We know, that some child q is already
* attached to this parent and have choice:
- * either to change it or to create/graft new one.
+ * 1) change it or 2) create/graft new one.
+ * If the requested qdisc kind is different
+ * than the existing one, then we choose graft.
+ * If they are the same then this is "change"
+ * operation - just let it fallthrough..
*
* 1. We are allowed to create/graft only
- * if CREATE and REPLACE flags are set.
+ * if the request is explicitly stating
+ * "please create if it doesn't exist".
*
- * 2. If EXCL is set, requestor wanted to say,
- * that qdisc tcm_handle is not expected
+ * 2. If the request is to exclusive create
+ * then the qdisc tcm_handle is not expected
* to exist, so that we choose create/graft too.
*
* 3. The last case is when no flags are set.
+ * This will happen when for example tc
+ * utility issues a "change" command.
* Alas, it is sort of hole in API, we
* cannot decide what to do unambiguously.
- * For now we select create/graft, if
- * user gave KIND, which does not match existing.
+ * For now we select create/graft.
*/
- if ((n->nlmsg_flags & NLM_F_CREATE) &&
- (n->nlmsg_flags & NLM_F_REPLACE) &&
- ((n->nlmsg_flags & NLM_F_EXCL) ||
- (tca[TCA_KIND] &&
- nla_strcmp(tca[TCA_KIND], q->ops->id))))
- goto create_n_graft;
+ if (tca[TCA_KIND] &&
+ nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+ if (req_create_or_replace(n) ||
+ req_create_exclusive(n))
+ goto create_n_graft;
+ else if (req_change(n))
+ goto create_n_graft2;
+ }
}
}
} else {
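
A note on the reworked flag handling above: the req_create_or_replace(), req_create_exclusive() and req_change() helpers make the NLM_F combinations explicit. Hedged mapping based on typical iproute2 behaviour (flag usage may differ across tc versions):

	/*
	 *   tc qdisc add ...     -> NLM_F_CREATE | NLM_F_EXCL     -> req_create_exclusive()
	 *   tc qdisc replace ... -> NLM_F_CREATE | NLM_F_REPLACE  -> req_create_or_replace()
	 *   tc qdisc change ...  -> none of CREATE/REPLACE/EXCL   -> req_change()
	 */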
@@ -1610,12 +1717,12 @@ replay:
return -EEXIST;
}
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
- NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
return -EINVAL;
}
err = qdisc_change(q, tca, extack);
if (err == 0)
- qdisc_notify(net, skb, n, clid, NULL, q);
+ qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack);
return err;
create_n_graft:
@@ -1623,9 +1730,10 @@ create_n_graft:
NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
return -ENOENT;
}
+create_n_graft2:
if (clid == TC_H_INGRESS) {
if (dev_ingress_queue(dev)) {
- q = qdisc_create(dev, dev_ingress_queue(dev), p,
+ q = qdisc_create(dev, dev_ingress_queue(dev),
tcm->tcm_parent, tcm->tcm_parent,
tca, &err, extack);
} else {
@@ -1642,15 +1750,12 @@ create_n_graft:
else
dev_queue = netdev_get_tx_queue(dev, 0);
- q = qdisc_create(dev, dev_queue, p,
+ q = qdisc_create(dev, dev_queue,
tcm->tcm_parent, tcm->tcm_handle,
tca, &err, extack);
}
- if (q == NULL) {
- if (err == -EAGAIN)
- goto replay;
+ if (!q)
return err;
- }
graft:
err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
@@ -1663,6 +1768,58 @@ graft:
return 0;
}
+static void request_qdisc_module(struct nlattr *kind)
+{
+ struct Qdisc_ops *ops;
+ char name[IFNAMSIZ];
+
+ if (!kind)
+ return;
+
+ ops = qdisc_lookup_ops(kind);
+ if (ops) {
+ bpf_module_put(ops, ops->owner);
+ return;
+ }
+
+ if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
+ rtnl_unlock();
+ request_module(NET_SCH_ALIAS_PREFIX "%s", name);
+ rtnl_lock();
+ }
+}
+
+/*
+ * Create/change qdisc.
+ */
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ struct tcmsg *tcm;
+ int err;
+
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ request_qdisc_module(tca[TCA_KIND]);
+
+ tcm = nlmsg_data(n);
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ netdev_lock_ops(dev);
+ err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm);
+ netdev_unlock_ops(dev);
+
+ return err;
+}
+
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
struct netlink_callback *cb,
int *q_idx_p, int s_q_idx, bool recur,
@@ -1682,7 +1839,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWQDISC) <= 0)
+ RTM_NEWQDISC, NULL) <= 0)
goto done;
q_idx++;
}
@@ -1704,7 +1861,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWQDISC) <= 0)
+ RTM_NEWQDISC, NULL) <= 0)
goto done;
q_idx++;
}
@@ -1733,8 +1890,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
idx = 0;
ASSERT_RTNL();
- err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
- rtm_tca_policy, cb->extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
+ rtm_tca_policy, cb->extack);
if (err < 0)
return err;
@@ -1747,16 +1904,23 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
s_q_idx = 0;
q_idx = 0;
- if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
- true, tca[TCA_DUMP_INVISIBLE]) < 0)
+ netdev_lock_ops(dev);
+ if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
+ skb, cb, &q_idx, s_q_idx,
+ true, tca[TCA_DUMP_INVISIBLE]) < 0) {
+ netdev_unlock_ops(dev);
goto done;
+ }
dev_queue = dev_ingress_queue(dev);
if (dev_queue &&
- tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
- &q_idx, s_q_idx, false,
- tca[TCA_DUMP_INVISIBLE]) < 0)
+ tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
+ skb, cb, &q_idx, s_q_idx, false,
+ tca[TCA_DUMP_INVISIBLE]) < 0) {
+ netdev_unlock_ops(dev);
goto done;
+ }
+ netdev_unlock_ops(dev);
cont:
idx++;
@@ -1776,8 +1940,8 @@ done:
************************************************/
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
- unsigned long cl,
- u32 portid, u32 seq, u16 flags, int event)
+ unsigned long cl, u32 portid, u32 seq, u16 flags,
+ int event, struct netlink_ext_ack *extack)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -1812,7 +1976,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
if (gnet_stats_finish_copy(&d) < 0)
goto nla_put_failure;
+ if (extack && extack->_msg &&
+ nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
+ goto out_nlmsg_trim;
+
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
return skb->len;
out_nlmsg_trim:
@@ -1823,7 +1992,30 @@ nla_put_failure:
static int tclass_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct Qdisc *q,
- unsigned long cl, int event)
+ unsigned long cl, int event, struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct Qdisc *q,
+ unsigned long cl, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -1832,7 +2024,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
if (!skb)
return -ENOBUFS;
- if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
+ extack) < 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1844,7 +2037,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
static int tclass_del_notify(struct net *net,
const struct Qdisc_class_ops *cops,
struct sk_buff *oskb, struct nlmsghdr *n,
- struct Qdisc *q, unsigned long cl)
+ struct Qdisc *q, unsigned long cl,
+ struct netlink_ext_ack *extack)
{
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
struct sk_buff *skb;
@@ -1853,75 +2047,105 @@ static int tclass_del_notify(struct net *net,
if (!cops->delete)
return -EOPNOTSUPP;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
+ if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
- if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
- RTM_DELTCLASS) < 0) {
- kfree_skb(skb);
- return -EINVAL;
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
+ RTM_DELTCLASS, extack) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ } else {
+ skb = NULL;
}
- err = cops->delete(q, cl);
+ err = cops->delete(q, cl, extack);
if (err) {
kfree_skb(skb);
return err;
}
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
+ err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ return err;
}
#ifdef CONFIG_NET_CLS
struct tcf_bind_args {
struct tcf_walker w;
- u32 classid;
+ unsigned long base;
unsigned long cl;
+ u32 classid;
};
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
struct tcf_bind_args *a = (void *)arg;
- if (tp->ops->bind_class) {
+ if (n && tp->ops->bind_class) {
struct Qdisc *q = tcf_block_q(tp->chain->block);
sch_tree_lock(q);
- tp->ops->bind_class(n, a->classid, a->cl);
+ tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
sch_tree_unlock(q);
}
return 0;
}
-static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
- unsigned long new_cl)
+struct tc_bind_class_args {
+ struct qdisc_walker w;
+ unsigned long new_cl;
+ u32 portid;
+ u32 clid;
+};
+
+static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *w)
{
+ struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
const struct Qdisc_class_ops *cops = q->ops->cl_ops;
struct tcf_block *block;
struct tcf_chain *chain;
- unsigned long cl;
- cl = cops->find(q, portid);
- if (!cl)
- return;
block = cops->tcf_block(q, cl, NULL);
if (!block)
- return;
- list_for_each_entry(chain, &block->chain_list, list) {
+ return 0;
+ for (chain = tcf_get_next_chain(block, NULL);
+ chain;
+ chain = tcf_get_next_chain(block, chain)) {
struct tcf_proto *tp;
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next)) {
+ for (tp = tcf_get_next_proto(chain, NULL);
+ tp; tp = tcf_get_next_proto(chain, tp)) {
struct tcf_bind_args arg = {};
arg.w.fn = tcf_node_bind;
- arg.classid = clid;
- arg.cl = new_cl;
- tp->ops->walk(tp, &arg.w);
+ arg.classid = a->clid;
+ arg.base = cl;
+ arg.cl = a->new_cl;
+ tp->ops->walk(tp, &arg.w, true);
}
}
+
+ return 0;
+}
+
+static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
+ unsigned long new_cl)
+{
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct tc_bind_class_args args = {};
+
+ if (!cops->tcf_block)
+ return;
+ args.portid = portid;
+ args.clid = clid;
+ args.new_cl = new_cl;
+ args.w.fn = tc_bind_class_walker;
+ q->ops->cl_ops->walk(q, &args.w);
}
#else
@@ -1933,15 +2157,15 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
#endif
-static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
- struct netlink_ext_ack *extack)
+static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack,
+ struct net_device *dev,
+ struct nlattr *tca[TCA_MAX + 1],
+ struct tcmsg *tcm)
{
struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm = nlmsg_data(n);
- struct nlattr *tca[TCA_MAX + 1];
- struct net_device *dev;
- struct Qdisc *q = NULL;
const struct Qdisc_class_ops *cops;
+ struct Qdisc *q = NULL;
unsigned long cl = 0;
unsigned long new_cl;
u32 portid;
@@ -1949,19 +2173,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
u32 qid;
int err;
- if ((n->nlmsg_type != RTM_GETTCLASS) &&
- !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- extack);
- if (err < 0)
- return err;
-
- dev = __dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
/*
parent == TC_H_UNSPEC - unspecified parent.
parent == TC_H_ROOT - class is root, which has no parent.
@@ -1991,7 +2202,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
} else if (qid1) {
qid = qid1;
} else if (qid == 0)
- qid = dev->qdisc->handle;
+ qid = rtnl_dereference(dev->qdisc)->handle;
/* Now qid is genuine qdisc handle consistent
* both with parent and child.
@@ -2002,7 +2213,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
portid = TC_H_MAKE(qid, portid);
} else {
if (qid == 0)
- qid = dev->qdisc->handle;
+ qid = rtnl_dereference(dev->qdisc)->handle;
}
/* OK. Locate qdisc */
@@ -2038,12 +2249,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
goto out;
break;
case RTM_DELTCLASS:
- err = tclass_del_notify(net, cops, skb, n, q, cl);
+ err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
		/* Unbind the class's filters by rebinding them to classid 0 */
tc_bind_tclass(q, portid, clid, 0);
goto out;
case RTM_GETTCLASS:
- err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
+ err = tclass_get_notify(net, skb, n, q, cl, extack);
goto out;
default:
err = -EINVAL;
@@ -2056,12 +2267,18 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
return -EOPNOTSUPP;
}
+ /* Prevent creation of traffic classes with classid TC_H_ROOT */
+ if (clid == TC_H_ROOT) {
+ NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT");
+ return -EINVAL;
+ }
+
new_cl = cl;
err = -EOPNOTSUPP;
if (cops->change)
err = cops->change(q, clid, portid, tca, &new_cl, extack);
if (err == 0) {
- tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
+ tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
/* We just create a new class, need to do reverse binding. */
if (cl != new_cl)
tc_bind_tclass(q, portid, clid, new_cl);
@@ -2070,6 +2287,31 @@ out:
return err;
}
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm = nlmsg_data(n);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ netdev_lock_ops(dev);
+ err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm);
+ netdev_unlock_ops(dev);
+
+ return err;
+}
+
struct qdisc_dump_args {
struct qdisc_walker w;
struct sk_buff *skb;
@@ -2083,7 +2325,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTCLASS);
+ RTM_NEWTCLASS, NULL);
}
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
@@ -2117,7 +2359,7 @@ static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
struct tcmsg *tcm, struct netlink_callback *cb,
- int *t_p, int s_t)
+ int *t_p, int s_t, bool recur)
{
struct Qdisc *q;
int b;
@@ -2128,7 +2370,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
return -1;
- if (!qdisc_dev(root))
+ if (!qdisc_dev(root) || !recur)
return 0;
if (tcm->tcm_parent) {
@@ -2146,39 +2388,54 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
return 0;
}
-static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb,
+ struct tcmsg *tcm, struct net_device *dev)
{
- struct tcmsg *tcm = nlmsg_data(cb->nlh);
- struct net *net = sock_net(skb->sk);
struct netdev_queue *dev_queue;
- struct net_device *dev;
int t, s_t;
- if (nlmsg_len(cb->nlh) < sizeof(*tcm))
- return 0;
- dev = dev_get_by_index(net, tcm->tcm_ifindex);
- if (!dev)
- return 0;
-
s_t = cb->args[0];
t = 0;
- if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
+ if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
+ skb, tcm, cb, &t, s_t, true) < 0)
goto done;
dev_queue = dev_ingress_queue(dev);
if (dev_queue &&
- tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
- &t, s_t) < 0)
+ tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
+ skb, tcm, cb, &t, s_t, false) < 0)
goto done;
done:
cb->args[0] = t;
- dev_put(dev);
return skb->len;
}
+static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int err;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return 0;
+
+ dev = dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return 0;
+
+ netdev_lock_ops(dev);
+ err = __tc_dump_tclass(skb, cb, tcm, dev);
+ netdev_unlock_ops(dev);
+
+ dev_put(dev);
+
+ return err;
+}
+
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
@@ -2221,6 +2478,21 @@ static struct pernet_operations psched_net_ops = {
.exit = psched_net_exit,
};
+#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
+DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
+#endif
+
+static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc},
+ {.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc},
+ {.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc,
+ .dumpit = tc_dump_qdisc},
+ {.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass},
+ {.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass},
+ {.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass,
+ .dumpit = tc_dump_tclass},
+};
+
static int __init pktsched_init(void)
{
int err;
@@ -2239,14 +2511,9 @@ static int __init pktsched_init(void)
register_qdisc(&mq_qdisc_ops);
register_qdisc(&noqueue_qdisc_ops);
- rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
- 0);
- rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
- 0);
+ rtnl_register_many(psched_rtnl_msg_handlers);
+
+ tc_wrapper_init();
return 0;
}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
deleted file mode 100644
index d714d3747bcb..000000000000
--- a/net/sched/sch_atm.c
+++ /dev/null
@@ -1,705 +0,0 @@
-/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
-
-/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/atmdev.h>
-#include <linux/atmclip.h>
-#include <linux/rtnetlink.h>
-#include <linux/file.h> /* for fput */
-#include <net/netlink.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-
-/*
- * The ATM queuing discipline provides a framework for invoking classifiers
- * (aka "filters"), which in turn select classes of this queuing discipline.
- * Each class maps the flow(s) it is handling to a given VC. Multiple classes
- * may share the same VC.
- *
- * When creating a class, VCs are specified by passing the number of the open
- * socket descriptor by which the calling process references the VC. The kernel
- * keeps the VC open at least until all classes using it are removed.
- *
- * In this file, most functions are named atm_tc_* to avoid confusion with all
- * the atm_* in net/atm. This naming convention differs from what's used in the
- * rest of net/sched.
- *
- * Known bugs:
- * - sometimes messes up the IP stack
- * - any manipulations besides the few operations described in the README, are
- * untested and likely to crash the system
- * - should lock the flow while there is data in the queue (?)
- */
-
-#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
-
-struct atm_flow_data {
- struct Qdisc_class_common common;
- struct Qdisc *q; /* FIFO, TBF, etc. */
- struct tcf_proto __rcu *filter_list;
- struct tcf_block *block;
- struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
- void (*old_pop)(struct atm_vcc *vcc,
- struct sk_buff *skb); /* chaining */
- struct atm_qdisc_data *parent; /* parent qdisc */
- struct socket *sock; /* for closing */
- int ref; /* reference count */
- struct gnet_stats_basic_packed bstats;
- struct gnet_stats_queue qstats;
- struct list_head list;
- struct atm_flow_data *excess; /* flow for excess traffic;
- NULL to set CLP instead */
- int hdr_len;
- unsigned char hdr[0]; /* header data; MUST BE LAST */
-};
-
-struct atm_qdisc_data {
- struct atm_flow_data link; /* unclassified skbs go here */
- struct list_head flows; /* NB: "link" is also on this
- list */
- struct tasklet_struct task; /* dequeue tasklet */
-};
-
-/* ------------------------- Class/flow operations ------------------------- */
-
-static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- list_for_each_entry(flow, &p->flows, list) {
- if (flow->common.classid == classid)
- return flow;
- }
- return NULL;
-}
-
-static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
- struct Qdisc *new, struct Qdisc **old,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
- pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
- sch, p, flow, new, old);
- if (list_empty(&flow->list))
- return -EINVAL;
- if (!new)
- new = &noop_qdisc;
- *old = flow->q;
- flow->q = new;
- if (*old)
- qdisc_reset(*old);
- return 0;
-}
-
-static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
-{
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
- pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
- return flow ? flow->q : NULL;
-}
-
-static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid)
-{
- struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
- flow = lookup_flow(sch, classid);
- pr_debug("%s: flow %p\n", __func__, flow);
- return (unsigned long)flow;
-}
-
-static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
- unsigned long parent, u32 classid)
-{
- struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
- flow = lookup_flow(sch, classid);
- if (flow)
- flow->ref++;
- pr_debug("%s: flow %p\n", __func__, flow);
- return (unsigned long)flow;
-}
-
-/*
- * atm_tc_put handles all destructions, including the ones that are explicitly
- * requested (atm_tc_destroy, etc.). The assumption here is that we never drop
- * anything that still seems to be in use.
- */
-static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
- pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
- if (--flow->ref)
- return;
- pr_debug("atm_tc_put: destroying\n");
- list_del_init(&flow->list);
- pr_debug("atm_tc_put: qdisc %p\n", flow->q);
- qdisc_put(flow->q);
- tcf_block_put(flow->block);
- if (flow->sock) {
- pr_debug("atm_tc_put: f_count %ld\n",
- file_count(flow->sock->file));
- flow->vcc->pop = flow->old_pop;
- sockfd_put(flow->sock);
- }
- if (flow->excess)
- atm_tc_put(sch, (unsigned long)flow->excess);
- if (flow != &p->link)
- kfree(flow);
- /*
- * If flow == &p->link, the qdisc no longer works at this point and
- * needs to be removed. (By the caller of atm_tc_put.)
- */
-}
-
-static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
-{
- struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
-
- pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
- VCC2FLOW(vcc)->old_pop(vcc, skb);
- tasklet_schedule(&p->task);
-}
-
-static const u8 llc_oui_ip[] = {
- 0xaa, /* DSAP: non-ISO */
- 0xaa, /* SSAP: non-ISO */
- 0x03, /* Ctrl: Unnumbered Information Command PDU */
- 0x00, /* OUI: EtherType */
- 0x00, 0x00,
- 0x08, 0x00
-}; /* Ethertype IP (0800) */
-
-static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
- [TCA_ATM_FD] = { .type = NLA_U32 },
- [TCA_ATM_EXCESS] = { .type = NLA_U32 },
-};
-
-static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
- struct nlattr **tca, unsigned long *arg,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
- struct atm_flow_data *excess = NULL;
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_ATM_MAX + 1];
- struct socket *sock;
- int fd, error, hdr_len;
- void *hdr;
-
- pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
- "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
- /*
- * The concept of parents doesn't apply for this qdisc.
- */
- if (parent && parent != TC_H_ROOT && parent != sch->handle)
- return -EINVAL;
- /*
- * ATM classes cannot be changed. In order to change properties of the
- * ATM connection, that socket needs to be modified directly (via the
- * native ATM API. In order to send a flow to a different VC, the old
- * class needs to be removed and a new one added. (This may be changed
- * later.)
- */
- if (flow)
- return -EBUSY;
- if (opt == NULL)
- return -EINVAL;
-
- error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy, NULL);
- if (error < 0)
- return error;
-
- if (!tb[TCA_ATM_FD])
- return -EINVAL;
- fd = nla_get_u32(tb[TCA_ATM_FD]);
- pr_debug("atm_tc_change: fd %d\n", fd);
- if (tb[TCA_ATM_HDR]) {
- hdr_len = nla_len(tb[TCA_ATM_HDR]);
- hdr = nla_data(tb[TCA_ATM_HDR]);
- } else {
- hdr_len = RFC1483LLC_LEN;
- hdr = NULL; /* default LLC/SNAP for IP */
- }
- if (!tb[TCA_ATM_EXCESS])
- excess = NULL;
- else {
- excess = (struct atm_flow_data *)
- atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
- if (!excess)
- return -ENOENT;
- }
- pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
- opt->nla_type, nla_len(opt), hdr_len);
- sock = sockfd_lookup(fd, &error);
- if (!sock)
- return error; /* f_count++ */
- pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
- if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
- error = -EPROTOTYPE;
- goto err_out;
- }
- /* @@@ should check if the socket is really operational or we'll crash
- on vcc->send */
- if (classid) {
- if (TC_H_MAJ(classid ^ sch->handle)) {
- pr_debug("atm_tc_change: classid mismatch\n");
- error = -EINVAL;
- goto err_out;
- }
- } else {
- int i;
- unsigned long cl;
-
- for (i = 1; i < 0x8000; i++) {
- classid = TC_H_MAKE(sch->handle, 0x8000 | i);
- cl = atm_tc_find(sch, classid);
- if (!cl)
- break;
- }
- }
- pr_debug("atm_tc_change: new id %x\n", classid);
- flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
- pr_debug("atm_tc_change: flow %p\n", flow);
- if (!flow) {
- error = -ENOBUFS;
- goto err_out;
- }
-
- error = tcf_block_get(&flow->block, &flow->filter_list, sch,
- extack);
- if (error) {
- kfree(flow);
- goto err_out;
- }
-
- flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
- extack);
- if (!flow->q)
- flow->q = &noop_qdisc;
- pr_debug("atm_tc_change: qdisc %p\n", flow->q);
- flow->sock = sock;
- flow->vcc = ATM_SD(sock); /* speedup */
- flow->vcc->user_back = flow;
- pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
- flow->old_pop = flow->vcc->pop;
- flow->parent = p;
- flow->vcc->pop = sch_atm_pop;
- flow->common.classid = classid;
- flow->ref = 1;
- flow->excess = excess;
- list_add(&flow->list, &p->link.list);
- flow->hdr_len = hdr_len;
- if (hdr)
- memcpy(flow->hdr, hdr, hdr_len);
- else
- memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
- *arg = (unsigned long)flow;
- return 0;
-err_out:
- sockfd_put(sock);
- return error;
-}
-
-static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
- pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
- if (list_empty(&flow->list))
- return -EINVAL;
- if (rcu_access_pointer(flow->filter_list) || flow == &p->link)
- return -EBUSY;
- /*
- * Reference count must be 2: one for "keepalive" (set at class
- * creation), and one for the reference held when calling delete.
- */
- if (flow->ref < 2) {
- pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
- return -EINVAL;
- }
- if (flow->ref > 2)
- return -EBUSY; /* catch references via excess, etc. */
- atm_tc_put(sch, arg);
- return 0;
-}
-
-static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
- if (walker->stop)
- return;
- list_for_each_entry(flow, &p->flows, list) {
- if (walker->count >= walker->skip &&
- walker->fn(sch, (unsigned long)flow, walker) < 0) {
- walker->stop = 1;
- break;
- }
- walker->count++;
- }
-}
-
-static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
- pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
- return flow ? flow->block : p->link.block;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
-
-static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff **to_free)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
- struct tcf_result res;
- int result;
- int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-
- pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
- result = TC_ACT_OK; /* be nice to gcc */
- flow = NULL;
- if (TC_H_MAJ(skb->priority) != sch->handle ||
- !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) {
- struct tcf_proto *fl;
-
- list_for_each_entry(flow, &p->flows, list) {
- fl = rcu_dereference_bh(flow->filter_list);
- if (fl) {
- result = tcf_classify(skb, fl, &res, true);
- if (result < 0)
- continue;
- flow = (struct atm_flow_data *)res.class;
- if (!flow)
- flow = lookup_flow(sch, res.classid);
- goto done;
- }
- }
- flow = NULL;
-done:
- ;
- }
- if (!flow) {
- flow = &p->link;
- } else {
- if (flow->vcc)
- ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
- /*@@@ looks good ... but it's not supposed to work :-) */
-#ifdef CONFIG_NET_CLS_ACT
- switch (result) {
- case TC_ACT_QUEUED:
- case TC_ACT_STOLEN:
- case TC_ACT_TRAP:
- __qdisc_drop(skb, to_free);
- return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- case TC_ACT_SHOT:
- __qdisc_drop(skb, to_free);
- goto drop;
- case TC_ACT_RECLASSIFY:
- if (flow->excess)
- flow = flow->excess;
- else
- ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
- break;
- }
-#endif
- }
-
- ret = qdisc_enqueue(skb, flow->q, to_free);
- if (ret != NET_XMIT_SUCCESS) {
-drop: __maybe_unused
- if (net_xmit_drop_count(ret)) {
- qdisc_qstats_drop(sch);
- if (flow)
- flow->qstats.drops++;
- }
- return ret;
- }
- /*
- * Okay, this may seem weird. We pretend we've dropped the packet if
- * it goes via ATM. The reason for this is that the outer qdisc
- * expects to be able to q->dequeue the packet later on if we return
- * success at this place. Also, sch->q.qdisc needs to reflect whether
- * there is a packet egligible for dequeuing or not. Note that the
- * statistics of the outer qdisc are necessarily wrong because of all
- * this. There's currently no correct solution for this.
- */
- if (flow == &p->link) {
- sch->q.qlen++;
- return NET_XMIT_SUCCESS;
- }
- tasklet_schedule(&p->task);
- return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-}
-
-/*
- * Dequeue packets and send them over ATM. Note that we quite deliberately
- * avoid checking net_device's flow control here, simply because sch_atm
- * uses its own channels, which have nothing to do with any CLIP/LANE/or
- * non-ATM interfaces.
- */
-
-static void sch_atm_dequeue(unsigned long data)
-{
- struct Qdisc *sch = (struct Qdisc *)data;
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
- struct sk_buff *skb;
-
- pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list) {
- if (flow == &p->link)
- continue;
- /*
- * If traffic is properly shaped, this won't generate nasty
- * little bursts. Otherwise, it may ... (but that's okay)
- */
- while ((skb = flow->q->ops->peek(flow->q))) {
- if (!atm_may_send(flow->vcc, skb->truesize))
- break;
-
- skb = qdisc_dequeue_peeked(flow->q);
- if (unlikely(!skb))
- break;
-
- qdisc_bstats_update(sch, skb);
- bstats_update(&flow->bstats, skb);
- pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
- /* remove any LL header somebody else has attached */
- skb_pull(skb, skb_network_offset(skb));
- if (skb_headroom(skb) < flow->hdr_len) {
- struct sk_buff *new;
-
- new = skb_realloc_headroom(skb, flow->hdr_len);
- dev_kfree_skb(skb);
- if (!new)
- continue;
- skb = new;
- }
- pr_debug("sch_atm_dequeue: ip %p, data %p\n",
- skb_network_header(skb), skb->data);
- ATM_SKB(skb)->vcc = flow->vcc;
- memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
- flow->hdr_len);
- refcount_add(skb->truesize,
- &sk_atm(flow->vcc)->sk_wmem_alloc);
- /* atm.atm_options are already set by atm_tc_enqueue */
- flow->vcc->send(flow->vcc, skb);
- }
- }
-}
-
-static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct sk_buff *skb;
-
- pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
- tasklet_schedule(&p->task);
- skb = qdisc_dequeue_peeked(p->link.q);
- if (skb)
- sch->q.qlen--;
- return skb;
-}
-
-static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
-
- return p->link.q->ops->peek(p->link.q);
-}
-
-static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- int err;
-
- pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
- INIT_LIST_HEAD(&p->flows);
- INIT_LIST_HEAD(&p->link.list);
- list_add(&p->link.list, &p->flows);
- p->link.q = qdisc_create_dflt(sch->dev_queue,
- &pfifo_qdisc_ops, sch->handle, extack);
- if (!p->link.q)
- p->link.q = &noop_qdisc;
- pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
-
- err = tcf_block_get(&p->link.block, &p->link.filter_list, sch,
- extack);
- if (err)
- return err;
-
- p->link.vcc = NULL;
- p->link.sock = NULL;
- p->link.common.classid = sch->handle;
- p->link.ref = 1;
- tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
- return 0;
-}
-
-static void atm_tc_reset(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow;
-
- pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list)
- qdisc_reset(flow->q);
- sch->q.qlen = 0;
-}
-
-static void atm_tc_destroy(struct Qdisc *sch)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow, *tmp;
-
- pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list) {
- tcf_block_put(flow->block);
- flow->block = NULL;
- }
-
- list_for_each_entry_safe(flow, tmp, &p->flows, list) {
- if (flow->ref > 1)
- pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
- atm_tc_put(sch, (unsigned long)flow);
- }
- tasklet_kill(&p->task);
-}
-
-static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
- struct sk_buff *skb, struct tcmsg *tcm)
-{
- struct atm_qdisc_data *p = qdisc_priv(sch);
- struct atm_flow_data *flow = (struct atm_flow_data *)cl;
- struct nlattr *nest;
-
- pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
- sch, p, flow, skb, tcm);
- if (list_empty(&flow->list))
- return -EINVAL;
- tcm->tcm_handle = flow->common.classid;
- tcm->tcm_info = flow->q->handle;
-
- nest = nla_nest_start(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
-
- if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
- goto nla_put_failure;
- if (flow->vcc) {
- struct sockaddr_atmpvc pvc;
- int state;
-
- memset(&pvc, 0, sizeof(pvc));
- pvc.sap_family = AF_ATMPVC;
- pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
- pvc.sap_addr.vpi = flow->vcc->vpi;
- pvc.sap_addr.vci = flow->vcc->vci;
- if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
- goto nla_put_failure;
- state = ATM_VF2VS(flow->vcc->flags);
- if (nla_put_u32(skb, TCA_ATM_STATE, state))
- goto nla_put_failure;
- }
- if (flow->excess) {
- if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->common.classid))
- goto nla_put_failure;
- } else {
- if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
- goto nla_put_failure;
- }
- return nla_nest_end(skb, nest);
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-static int
-atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
- struct gnet_dump *d)
-{
- struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &flow->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
- return -1;
-
- return 0;
-}
-
-static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- return 0;
-}
-
-static const struct Qdisc_class_ops atm_class_ops = {
- .graft = atm_tc_graft,
- .leaf = atm_tc_leaf,
- .find = atm_tc_find,
- .change = atm_tc_change,
- .delete = atm_tc_delete,
- .walk = atm_tc_walk,
- .tcf_block = atm_tc_tcf_block,
- .bind_tcf = atm_tc_bind_filter,
- .unbind_tcf = atm_tc_put,
- .dump = atm_tc_dump_class,
- .dump_stats = atm_tc_dump_class_stats,
-};
-
-static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
- .cl_ops = &atm_class_ops,
- .id = "atm",
- .priv_size = sizeof(struct atm_qdisc_data),
- .enqueue = atm_tc_enqueue,
- .dequeue = atm_tc_dequeue,
- .peek = atm_tc_peek,
- .init = atm_tc_init,
- .reset = atm_tc_reset,
- .destroy = atm_tc_destroy,
- .dump = atm_tc_dump,
- .owner = THIS_MODULE,
-};
-
-static int __init atm_init(void)
-{
- return register_qdisc(&atm_qdisc_ops);
-}
-
-static void __exit atm_exit(void)
-{
- unregister_qdisc(&atm_qdisc_ops);
-}
-
-module_init(atm_init)
-module_exit(atm_exit)
-MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
index 9c4c2bb547d7..a7f7667ae984 100644
--- a/net/sched/sch_blackhole.c
+++ b/net/sched/sch_blackhole.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_blackhole.c Black hole queue
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*
* Note: Quantum tunneling is not supported.
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 73940293700d..4a64d6397b6f 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -65,6 +65,7 @@
#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <linux/if_vlan.h>
+#include <net/gso.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
@@ -138,8 +139,8 @@ struct cake_flow {
struct cake_host {
u32 srchost_tag;
u32 dsthost_tag;
- u16 srchost_refcnt;
- u16 dsthost_refcnt;
+ u16 srchost_bulk_flow_count;
+ u16 dsthost_bulk_flow_count;
};
struct cake_heap_entry {
@@ -173,8 +174,7 @@ struct cake_tin_data {
u64 tin_rate_bps;
u16 tin_rate_shft;
- u16 tin_quantum_prio;
- u16 tin_quantum_band;
+ u16 tin_quantum;
s32 tin_deficit;
u32 tin_backlog;
u32 tin_dropped;
@@ -211,6 +211,9 @@ struct cake_sched_data {
u8 ack_filter;
u8 atm_mode;
+ u32 fwmark_mask;
+ u16 fwmark_shft;
+
/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
u16 rate_shft;
ktime_t time_next_packet;
@@ -310,8 +313,8 @@ static const u8 precedence[] = {
};
static const u8 diffserv8[] = {
- 2, 5, 1, 2, 4, 2, 2, 2,
- 0, 2, 1, 2, 1, 2, 1, 2,
+ 2, 0, 1, 2, 4, 2, 2, 2,
+ 1, 2, 1, 2, 1, 2, 1, 2,
5, 2, 4, 2, 4, 2, 4, 2,
3, 2, 3, 2, 3, 2, 3, 2,
6, 2, 3, 2, 3, 2, 3, 2,
@@ -321,7 +324,7 @@ static const u8 diffserv8[] = {
};
static const u8 diffserv4[] = {
- 0, 2, 0, 0, 2, 0, 0, 0,
+ 0, 1, 0, 0, 2, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0,
2, 0, 2, 0, 2, 0, 2, 0,
2, 0, 2, 0, 2, 0, 2, 0,
@@ -332,7 +335,7 @@ static const u8 diffserv4[] = {
};
static const u8 diffserv3[] = {
- 0, 0, 0, 0, 2, 0, 0, 0,
+ 0, 1, 0, 0, 2, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@@ -358,8 +361,24 @@ static const u8 besteffort[] = {
static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
static const u8 bulk_order[] = {1, 0, 2, 3};
+/* There is a big difference in timing between the accurate values placed in the
+ * cache and the approximations given by a single Newton step for small count
+ * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
+ * these values are calculated using eight Newton steps, using the
+ * implementation below. Above 16, a single Newton step gives sufficient
+ * accuracy in either direction, given the precision stored.
+ *
+ * The magnitude of the error when stepping up to count 2 is such as to give the
+ * value that *should* have been produced at count 4.
+ */
+
#define REC_INV_SQRT_CACHE (16)
-static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
+static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
+ ~0, ~0, 3037000500, 2479700525,
+ 2147483647, 1920767767, 1753413056, 1623345051,
+ 1518500250, 1431655765, 1358187914, 1294981364,
+ 1239850263, 1191209601, 1147878294, 1108955788
+};
/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
* new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
@@ -385,47 +404,14 @@ static void cobalt_newton_step(struct cobalt_vars *vars)
static void cobalt_invsqrt(struct cobalt_vars *vars)
{
if (vars->count < REC_INV_SQRT_CACHE)
- vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
+ vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
else
cobalt_newton_step(vars);
}
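/*
 * Editorial sketch, not part of the patch: the hard-coded inv_sqrt_cache[]
 * entries above are Q0.32 approximations of 1/sqrt(count), which the CoDel
 * control law (t + interval/sqrt(count)) consumes.  Assuming the in-kernel
 * cobalt_newton_step() implements the fixed-point Newton-Raphson iteration
 * quoted in the comment above, values close to the table can be regenerated
 * with the userspace program below (low-order bits depend on the starting
 * point and step count, so treat the output as approximate).
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t newton_step(uint32_t invsqrt, uint32_t count)
{
	uint32_t invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;
	uint64_t val = (3ULL << 32) - ((uint64_t)count * invsqrt2);

	val >>= 2;				/* avoid overflow in the multiply below */
	return (val * invsqrt) >> (32 - 2 + 1);
}

int main(void)
{
	uint32_t v = ~0U;			/* counts 0 and 1 map to 1.0 in Q0.32 */
	int count, i;

	for (count = 2; count < 16; count++) {
		for (i = 0; i < 8; i++)		/* eight steps, as described above */
			v = newton_step(v, count);
		printf("count %2d: %u\n", count, v);
	}
	return 0;
}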
-/* There is a big difference in timing between the accurate values placed in
- * the cache and the approximations given by a single Newton step for small
- * count values, particularly when stepping from count 1 to 2 or vice versa.
- * Above 16, a single Newton step gives sufficient accuracy in either
- * direction, given the precision stored.
- *
- * The magnitude of the error when stepping up to count 2 is such as to give
- * the value that *should* have been produced at count 4.
- */
-
-static void cobalt_cache_init(void)
-{
- struct cobalt_vars v;
-
- memset(&v, 0, sizeof(v));
- v.rec_inv_sqrt = ~0U;
- cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
-
- for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
- cobalt_newton_step(&v);
- cobalt_newton_step(&v);
- cobalt_newton_step(&v);
- cobalt_newton_step(&v);
-
- cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
- }
-}
-
static void cobalt_vars_init(struct cobalt_vars *vars)
{
memset(vars, 0, sizeof(*vars));
-
- if (!cobalt_rec_inv_sqrt_cache[0]) {
- cobalt_cache_init();
- cobalt_rec_inv_sqrt_cache[0] = ~0;
- }
}
/* CoDel control_law is t + interval/sqrt(count)
@@ -498,13 +484,14 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars,
/* Call this with a freshly dequeued packet for possible congestion marking.
* Returns true as an instruction to drop the packet, false for delivery.
*/
-static bool cobalt_should_drop(struct cobalt_vars *vars,
- struct cobalt_params *p,
- ktime_t now,
- struct sk_buff *skb,
- u32 bulk_flows)
-{
- bool next_due, over_target, drop = false;
+static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now,
+ struct sk_buff *skb,
+ u32 bulk_flows)
+{
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
+ bool next_due, over_target;
ktime_t schedule;
u64 sojourn;
@@ -547,7 +534,8 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
if (next_due && vars->dropping) {
/* Use ECN mark if possible, otherwise drop */
- drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+ if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
vars->count++;
if (!vars->count)
@@ -570,38 +558,61 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
}
/* Simple BLUE implementation. Lack of ECN is deliberate. */
- if (vars->p_drop)
- drop |= (prandom_u32() < vars->p_drop);
+ if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
+ get_random_u32() < vars->p_drop)
+ reason = SKB_DROP_REASON_CAKE_FLOOD;
/* Overload the drop_next field as an activity timeout */
if (!vars->count)
vars->drop_next = ktime_add_ns(now, p->interval);
- else if (ktime_to_ns(schedule) > 0 && !drop)
+ else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
vars->drop_next = now;
- return drop;
+ return reason;
}
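/*
 * Editorial note, not part of the patch: vars->p_drop appears to be a drop
 * probability in Q0.32 fixed point, so the comparison above drops a packet
 * with probability p_drop / 2^32.  A minimal sketch of that check:
 */
#include <stdbool.h>
#include <stdint.h>

static bool blue_should_drop(uint32_t p_drop, uint32_t rnd)
{
	/* rnd stands in for get_random_u32(): uniform over [0, 2^32) */
	return rnd < p_drop;		/* true with probability p_drop / 2^32 */
}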
-static void cake_update_flowkeys(struct flow_keys *keys,
+static bool cake_update_flowkeys(struct flow_keys *keys,
const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
struct nf_conntrack_tuple tuple = {};
- bool rev = !skb->_nfct;
+ bool rev = !skb->_nfct, upd = false;
+ __be32 ip;
- if (tc_skb_protocol(skb) != htons(ETH_P_IP))
- return;
+ if (skb_protocol(skb, true) != htons(ETH_P_IP))
+ return false;
if (!nf_ct_get_tuple_skb(&tuple, skb))
- return;
+ return false;
- keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
- keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
+ ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
+ if (ip != keys->addrs.v4addrs.src) {
+ keys->addrs.v4addrs.src = ip;
+ upd = true;
+ }
+ ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
+ if (ip != keys->addrs.v4addrs.dst) {
+ keys->addrs.v4addrs.dst = ip;
+ upd = true;
+ }
if (keys->ports.ports) {
- keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
- keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
+ __be16 port;
+
+ port = rev ? tuple.dst.u.all : tuple.src.u.all;
+ if (port != keys->ports.src) {
+ keys->ports.src = port;
+ upd = true;
+ }
+ port = rev ? tuple.src.u.all : tuple.dst.u.all;
+ if (port != keys->ports.dst) {
+				keys->ports.dst = port;
+ upd = true;
+ }
}
+ return upd;
+#else
+ return false;
#endif
}
@@ -619,26 +630,96 @@ static bool cake_ddst(int flow_mode)
return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
}
+static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
+ struct cake_flow *flow,
+ int flow_mode)
+{
+ if (likely(cake_dsrc(flow_mode) &&
+ q->hosts[flow->srchost].srchost_bulk_flow_count))
+ q->hosts[flow->srchost].srchost_bulk_flow_count--;
+}
+
+static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
+ struct cake_flow *flow,
+ int flow_mode)
+{
+ if (likely(cake_dsrc(flow_mode) &&
+ q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
+ q->hosts[flow->srchost].srchost_bulk_flow_count++;
+}
+
+static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
+ struct cake_flow *flow,
+ int flow_mode)
+{
+ if (likely(cake_ddst(flow_mode) &&
+ q->hosts[flow->dsthost].dsthost_bulk_flow_count))
+ q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
+}
+
+static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
+ struct cake_flow *flow,
+ int flow_mode)
+{
+ if (likely(cake_ddst(flow_mode) &&
+ q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
+ q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
+}
+
+static u16 cake_get_flow_quantum(struct cake_tin_data *q,
+ struct cake_flow *flow,
+ int flow_mode)
+{
+ u16 host_load = 1;
+
+ if (cake_dsrc(flow_mode))
+ host_load = max(host_load,
+ q->hosts[flow->srchost].srchost_bulk_flow_count);
+
+ if (cake_ddst(flow_mode))
+ host_load = max(host_load,
+ q->hosts[flow->dsthost].dsthost_bulk_flow_count);
+
+ /* The get_random_u16() is a way to apply dithering to avoid
+ * accumulating roundoff errors
+ */
+ return (q->flow_quantum * quantum_div[host_load] +
+ get_random_u16()) >> 16;
+}
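/*
 * Editorial sketch, not part of the patch: quantum_div[n] is assumed to hold
 * 65535 / n, so the expression above computes flow_quantum / host_load in
 * Q16.16 fixed point.  Adding a random 16-bit value before the final shift
 * rounds up with probability equal to the discarded fraction, so repeated
 * calls average out to the exact per-host share instead of always rounding
 * down.
 */
#include <stdint.h>

static uint16_t dithered_host_share(uint16_t flow_quantum, uint16_t host_load,
				    uint16_t rnd16)
{
	/* host_load >= 1, as guaranteed by the caller above */
	uint32_t div = 65535 / host_load;	/* mirrors quantum_div[host_load] */

	/* rnd16 stands in for get_random_u16() */
	return ((uint32_t)flow_quantum * div + rnd16) >> 16;
}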
+
static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
int flow_mode, u16 flow_override, u16 host_override)
{
+ bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
+ bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
+ bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
u16 reduced_hash, srchost_idx, dsthost_idx;
struct flow_keys keys, host_keys;
+ bool use_skbhash = skb->l4_hash;
if (unlikely(flow_mode == CAKE_FLOW_NONE))
return 0;
- /* If both overrides are set we can skip packet dissection entirely */
- if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
- (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
+ /* If both overrides are set, or we can use the SKB hash and nat mode is
+ * disabled, we can skip packet dissection entirely. If nat mode is
+ * enabled there's another check below after doing the conntrack lookup.
+ */
+ if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
goto skip_hash;
skb_flow_dissect_flow_keys(skb, &keys,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
- if (flow_mode & CAKE_FLOW_NAT_FLAG)
- cake_update_flowkeys(&keys, skb);
+ /* Don't use the SKB hash if we change the lookup keys from conntrack */
+ if (nat_enabled && cake_update_flowkeys(&keys, skb))
+ use_skbhash = false;
+
+ /* If we can still use the SKB hash and don't need the host hash, we can
+ * skip the rest of the hashing procedure
+ */
+ if (use_skbhash && !hash_hosts)
+ goto skip_hash;
/* flow_hash_from_keys() sorts the addresses by value, so we have
* to preserve their order in a separate data structure to treat
@@ -677,12 +758,14 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
/* This *must* be after the above switch, since as a
* side-effect it sorts the src and dst addresses.
*/
- if (flow_mode & CAKE_FLOW_FLOWS)
+ if (hash_flows && !use_skbhash)
flow_hash = flow_hash_from_keys(&keys);
skip_hash:
if (flow_override)
flow_hash = flow_override - 1;
+ else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
+ flow_hash = skb->hash;
if (host_override) {
dsthost_hash = host_override - 1;
srchost_hash = host_override - 1;
@@ -746,10 +829,13 @@ skip_hash:
* queue, accept the collision, update the host tags.
*/
q->way_collisions++;
- q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
- q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
allocate_src = cake_dsrc(flow_mode);
allocate_dst = cake_ddst(flow_mode);
+
+ if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
+ cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
+ cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
+ }
found:
/* reserve queue for future packets in same flow */
reduced_hash = outer_hash + k;
@@ -767,14 +853,16 @@ found:
}
for (i = 0; i < CAKE_SET_WAYS;
i++, k = (k + 1) % CAKE_SET_WAYS) {
- if (!q->hosts[outer_hash + k].srchost_refcnt)
+ if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
break;
}
q->hosts[outer_hash + k].srchost_tag = srchost_hash;
found_src:
srchost_idx = outer_hash + k;
- q->hosts[srchost_idx].srchost_refcnt++;
q->flows[reduced_hash].srchost = srchost_idx;
+
+ if (q->flows[reduced_hash].set == CAKE_SET_BULK)
+ cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
}
if (allocate_dst) {
@@ -789,14 +877,16 @@ found_src:
}
for (i = 0; i < CAKE_SET_WAYS;
i++, k = (k + 1) % CAKE_SET_WAYS) {
- if (!q->hosts[outer_hash + k].dsthost_refcnt)
+ if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
break;
}
q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
found_dst:
dsthost_idx = outer_hash + k;
- q->hosts[dsthost_idx].dsthost_refcnt++;
q->flows[reduced_hash].dsthost = dsthost_idx;
+
+ if (q->flows[reduced_hash].set == CAKE_SET_BULK)
+ cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
}
}
@@ -900,7 +990,7 @@ static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
}
tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
- if (!tcph)
+ if (!tcph || tcph->doff < 5)
return NULL;
return skb_header_pointer(skb, offset,
@@ -924,6 +1014,8 @@ static const void *cake_get_tcpopt(const struct tcphdr *tcph,
length--;
continue;
}
+ if (length < 2)
+ break;
opsize = *ptr++;
if (opsize < 2 || opsize > length)
break;
@@ -1061,6 +1153,8 @@ static bool cake_tcph_may_drop(const struct tcphdr *tcph,
length--;
continue;
}
+ if (length < 2)
+ break;
opsize = *ptr++;
if (opsize < 2 || opsize > length)
break;
@@ -1162,7 +1256,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
iph_check->daddr != iph->daddr)
continue;
- seglen = ntohs(iph_check->tot_len) -
+ seglen = iph_totlen(skb, iph_check) -
(4 * iph_check->ihl);
} else if (iph_check->version == 6) {
ipv6h = (struct ipv6hdr *)iph;
@@ -1304,16 +1398,19 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
const struct skb_shared_info *shinfo = skb_shinfo(skb);
unsigned int hdr_len, last_len = 0;
u32 off = skb_network_offset(skb);
+ u16 segs = qdisc_pkt_segs(skb);
u32 len = qdisc_pkt_len(skb);
- u16 segs = 1;
q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
- if (!shinfo->gso_size)
+ if (segs == 1)
return cake_calc_overhead(q, len, off);
- /* borrowed from qdisc_pkt_len_init() */
- hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+ /* borrowed from qdisc_pkt_len_segs_init() */
+ if (!skb->encapsulation)
+ hdr_len = skb_transport_offset(skb);
+ else
+ hdr_len = skb_inner_transport_offset(skb);
/* + transport layer */
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
@@ -1321,24 +1418,18 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
const struct tcphdr *th;
struct tcphdr _tcphdr;
- th = skb_header_pointer(skb, skb_transport_offset(skb),
+ th = skb_header_pointer(skb, hdr_len,
sizeof(_tcphdr), &_tcphdr);
if (likely(th))
hdr_len += __tcp_hdrlen(th);
} else {
struct udphdr _udphdr;
- if (skb_header_pointer(skb, skb_transport_offset(skb),
+ if (skb_header_pointer(skb, hdr_len,
sizeof(_udphdr), &_udphdr))
hdr_len += sizeof(struct udphdr);
}
- if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
- segs = DIV_ROUND_UP(skb->len - hdr_len,
- shinfo->gso_size);
- else
- segs = shinfo->gso_segs;
-
len = shinfo->gso_size + hdr_len;
last_len = skb->len - shinfo->gso_size * (segs - 1);
@@ -1464,7 +1555,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
if (!q->overflow_timeout) {
int i;
/* Build fresh max-heap */
- for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
+ for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
cake_heapify(q, i);
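		/* Not part of the patch: in a 0-indexed array heap of n entries
		 * the last non-leaf node sits at index n/2 - 1, so sifting down
		 * from that index (rather than n/2) is all the rebuild needs.
		 */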
}
q->overflow_timeout = 65535;
@@ -1491,16 +1582,14 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
b->backlogs[idx] -= len;
b->tin_backlog -= len;
sch->qstats.backlog -= len;
- qdisc_tree_reduce_backlog(sch, 1, len);
flow->dropped++;
b->tin_dropped++;
- sch->qstats.drops++;
if (q->rate_flags & CAKE_FLAG_INGRESS)
cake_advance_shaper(q, b, skb, now, true);
- __qdisc_drop(skb, to_free);
+ qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
sch->q.qlen--;
cake_heapify(q, 0);
@@ -1508,35 +1597,51 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
return idx + (tin << 16);
}
-static void cake_wash_diffserv(struct sk_buff *skb)
-{
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
- break;
- case htons(ETH_P_IPV6):
- ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
- break;
- default:
- break;
- }
-}
-
-static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
+static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
{
+ const int offset = skb_network_offset(skb);
+ u16 *buf, buf_;
u8 dscp;
- switch (skb->protocol) {
+ switch (skb_protocol(skb, true)) {
case htons(ETH_P_IP):
- dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
- if (wash && dscp)
+ buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
+ if (unlikely(!buf))
+ return 0;
+
+ /* ToS is in the second byte of iphdr */
+ dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
+
+ if (wash && dscp) {
+ const int wlen = offset + sizeof(struct iphdr);
+
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ return 0;
+
ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+ }
+
return dscp;
case htons(ETH_P_IPV6):
- dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
- if (wash && dscp)
+ buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
+ if (unlikely(!buf))
+ return 0;
+
+ /* Traffic class is in the first and second bytes of ipv6hdr */
+ dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
+
+ if (wash && dscp) {
+ const int wlen = offset + sizeof(struct ipv6hdr);
+
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ return 0;
+
ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+ }
+
return dscp;
case htons(ETH_P_ARP):
@@ -1552,26 +1657,37 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
struct sk_buff *skb)
{
struct cake_sched_data *q = qdisc_priv(sch);
- u32 tin;
+ u32 tin, mark;
+ bool wash;
+ u8 dscp;
- if (TC_H_MAJ(skb->priority) == sch->handle &&
- TC_H_MIN(skb->priority) > 0 &&
- TC_H_MIN(skb->priority) <= q->tin_cnt) {
+ /* Tin selection: Default to diffserv-based selection, allow overriding
+ * using firewall marks or skb->priority. Call DSCP parsing early if
+ * wash is enabled, otherwise defer to below to skip unneeded parsing.
+ */
+ mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
+ wash = !!(q->rate_flags & CAKE_FLAG_WASH);
+ if (wash)
+ dscp = cake_handle_diffserv(skb, wash);
+
+ if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
+ tin = 0;
+
+ else if (mark && mark <= q->tin_cnt)
+ tin = q->tin_order[mark - 1];
+
+ else if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->tin_cnt)
tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
- if (q->rate_flags & CAKE_FLAG_WASH)
- cake_wash_diffserv(skb);
- } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
- /* extract the Diffserv Precedence field, if it exists */
- /* and clear DSCP bits if washing */
- tin = q->tin_index[cake_handle_diffserv(skb,
- q->rate_flags & CAKE_FLAG_WASH)];
+ else {
+ if (!wash)
+ dscp = cake_handle_diffserv(skb, wash);
+ tin = q->tin_index[dscp];
+
if (unlikely(tin >= q->tin_cnt))
tin = 0;
- } else {
- tin = 0;
- if (q->rate_flags & CAKE_FLAG_WASH)
- cake_wash_diffserv(skb);
}
return &q->tins[tin];
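/*
 * Worked example, not part of the patch: with fwmark_mask 0xff00 (and thus
 * fwmark_shft == __ffs(0xff00) == 8, as set up in cake_change()), a packet
 * carrying skb->mark 0x0300 yields mark == 3 and, provided the qdisc has at
 * least three tins, is queued in q->tin_order[2], overriding its DSCP.  A
 * mark of 0, or one above tin_cnt, falls through to the priority- and
 * DSCP-based rules.  A hypothetical helper mirroring the inline expression:
 */
static inline u32 cake_fwmark_to_mark(u32 skb_mark, u32 mask, u16 shft)
{
	return (skb_mark & mask) >> shft;	/* 0 means "no tin override" */
}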
@@ -1591,7 +1707,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
goto hash;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tcf_classify(skb, filter, &res, false);
+ result = tcf_classify(skb, NULL, filter, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
@@ -1600,7 +1716,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return 0;
}
@@ -1620,14 +1736,14 @@ static void cake_reconfigure(struct Qdisc *sch);
static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ u32 idx, tin, prev_qlen, prev_backlog, drop_id;
struct cake_sched_data *q = qdisc_priv(sch);
- int len = qdisc_pkt_len(skb);
- int uninitialized_var(ret);
+ int len = qdisc_pkt_len(skb), ret;
struct sk_buff *ack = NULL;
ktime_t now = ktime_get();
struct cake_tin_data *b;
struct cake_flow *flow;
- u32 idx;
+ bool same_flow = false;
/* choose flow to insert into */
idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
@@ -1637,6 +1753,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
__qdisc_drop(skb, to_free);
return ret;
}
+ tin = (u32)(b - q->tins);
idx--;
flow = &b->flows[idx];
@@ -1664,7 +1781,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(len > b->max_skblen))
b->max_skblen = len;
- if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+ if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
unsigned int slen = 0, numsegs = 0;
@@ -1673,10 +1790,10 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (IS_ERR_OR_NULL(segs))
return qdisc_drop(skb, sch, to_free);
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
+ qdisc_skb_cb(segs)->pkt_segs = 1;
cobalt_set_enqueue_time(segs, now);
get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
segs);
@@ -1687,7 +1804,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
slen += segs->len;
q->buffer_used += segs->truesize;
b->packets++;
- segs = nskb;
}
/* stats */
@@ -1701,6 +1817,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
consume_skb(skb);
} else {
/* not splitting */
+ int ack_pkt_len = 0;
+
cobalt_set_enqueue_time(skb, now);
get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
flow_queue_add(flow, skb);
@@ -1711,13 +1829,13 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (ack) {
b->ack_drops++;
sch->qstats.drops++;
- b->bytes += qdisc_pkt_len(ack);
- len -= qdisc_pkt_len(ack);
+ ack_pkt_len = qdisc_pkt_len(ack);
+ b->bytes += ack_pkt_len;
q->buffer_used += skb->truesize - ack->truesize;
if (q->rate_flags & CAKE_FLAG_INGRESS)
cake_advance_shaper(q, b, ack, now, true);
- qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
+ qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
consume_skb(ack);
} else {
sch->q.qlen++;
@@ -1726,11 +1844,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* stats */
b->packets++;
- b->bytes += len;
- b->backlogs[idx] += len;
- b->tin_backlog += len;
- sch->qstats.backlog += len;
- q->avg_window_bytes += len;
+ b->bytes += len - ack_pkt_len;
+ b->backlogs[idx] += len - ack_pkt_len;
+ b->tin_backlog += len - ack_pkt_len;
+ sch->qstats.backlog += len - ack_pkt_len;
+ q->avg_window_bytes += len - ack_pkt_len;
}
if (q->overflow_timeout)
@@ -1759,7 +1877,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
q->avg_window_begin));
u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
- do_div(b, window_interval);
+ b = div64_u64(b, window_interval);
q->avg_peak_bandwidth =
cake_ewma(q->avg_peak_bandwidth, b,
b > q->avg_peak_bandwidth ? 2 : 8);
@@ -1780,10 +1898,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* flowchain */
if (!flow->set || flow->set == CAKE_SET_DECAYING) {
- struct cake_host *srchost = &b->hosts[flow->srchost];
- struct cake_host *dsthost = &b->hosts[flow->dsthost];
- u16 host_load = 1;
-
if (!flow->set) {
list_add_tail(&flow->flowchain, &b->new_flows);
} else {
@@ -1793,14 +1907,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
flow->set = CAKE_SET_SPARSE;
b->sparse_flow_count++;
- if (cake_dsrc(q->flow_mode))
- host_load = max(host_load, srchost->srchost_refcnt);
-
- if (cake_ddst(q->flow_mode))
- host_load = max(host_load, dsthost->dsthost_refcnt);
-
- flow->deficit = (b->flow_quantum *
- quantum_div[host_load]) >> 16;
+ flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
/* this flow was empty, accounted as a sparse flow, but actually
* in the bulk rotation.
@@ -1808,20 +1915,37 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
flow->set = CAKE_SET_BULK;
b->sparse_flow_count--;
b->bulk_flow_count++;
+
+ cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
}
if (q->buffer_used > q->buffer_max_used)
q->buffer_max_used = q->buffer_used;
- if (q->buffer_used > q->buffer_limit) {
- u32 dropped = 0;
+ if (q->buffer_used <= q->buffer_limit)
+ return NET_XMIT_SUCCESS;
- while (q->buffer_used > q->buffer_limit) {
- dropped++;
- cake_drop(sch, to_free);
- }
- b->drop_overlimit += dropped;
+ prev_qlen = sch->q.qlen;
+ prev_backlog = sch->qstats.backlog;
+
+ while (q->buffer_used > q->buffer_limit) {
+ drop_id = cake_drop(sch, to_free);
+ if ((drop_id >> 16) == tin &&
+ (drop_id & 0xFFFF) == idx)
+ same_flow = true;
+ }
+
+ prev_qlen -= sch->q.qlen;
+ prev_backlog -= sch->qstats.backlog;
+ b->drop_overlimit += prev_qlen;
+
+ if (same_flow) {
+ qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
+ prev_backlog - len);
+ return NET_XMIT_CN;
}
+ qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
return NET_XMIT_SUCCESS;
}
@@ -1857,20 +1981,19 @@ static void cake_clear_tin(struct Qdisc *sch, u16 tin)
q->cur_tin = tin;
for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
while (!!(skb = cake_dequeue_one(sch)))
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
}
static struct sk_buff *cake_dequeue(struct Qdisc *sch)
{
struct cake_sched_data *q = qdisc_priv(sch);
struct cake_tin_data *b = &q->tins[q->cur_tin];
- struct cake_host *srchost, *dsthost;
+ enum skb_drop_reason reason;
ktime_t now = ktime_get();
struct cake_flow *flow;
struct list_head *head;
bool first_flow = true;
struct sk_buff *skb;
- u16 host_load;
u64 delay;
u32 len;
@@ -1899,7 +2022,7 @@ begin:
while (b->tin_deficit < 0 ||
!(b->sparse_flow_count + b->bulk_flow_count)) {
if (b->tin_deficit <= 0)
- b->tin_deficit += b->tin_quantum_band;
+ b->tin_deficit += b->tin_quantum;
if (b->sparse_flow_count + b->bulk_flow_count)
empty = false;
@@ -1970,28 +2093,8 @@ retry:
q->cur_flow = flow - b->flows;
first_flow = false;
- /* triple isolation (modified DRR++) */
- srchost = &b->hosts[flow->srchost];
- dsthost = &b->hosts[flow->dsthost];
- host_load = 1;
-
- if (cake_dsrc(q->flow_mode))
- host_load = max(host_load, srchost->srchost_refcnt);
-
- if (cake_ddst(q->flow_mode))
- host_load = max(host_load, dsthost->dsthost_refcnt);
-
- WARN_ON(host_load > CAKE_QUEUES);
-
/* flow isolation (DRR++) */
if (flow->deficit <= 0) {
- /* The shifted prandom_u32() is a way to apply dithering to
- * avoid accumulating roundoff errors
- */
- flow->deficit += (b->flow_quantum * quantum_div[host_load] +
- (prandom_u32() >> 16)) >> 16;
- list_move_tail(&flow->flowchain, &b->old_flows);
-
/* Keep all flows with deficits out of the sparse and decaying
* rotations. No non-empty flow can go into the decaying
* rotation, so they can't get deficits
@@ -2000,6 +2103,10 @@ retry:
if (flow->head) {
b->sparse_flow_count--;
b->bulk_flow_count++;
+
+ cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
+
flow->set = CAKE_SET_BULK;
} else {
/* we've moved it to the bulk rotation for
@@ -2009,6 +2116,10 @@ retry:
flow->set = CAKE_SET_SPARSE_WAIT;
}
}
+
+ flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
+ list_move_tail(&flow->flowchain, &b->old_flows);
+
goto retry;
}
@@ -2029,6 +2140,10 @@ retry:
&b->decaying_flows);
if (flow->set == CAKE_SET_BULK) {
b->bulk_flow_count--;
+
+ cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
+
b->decaying_flow_count++;
} else if (flow->set == CAKE_SET_SPARSE ||
flow->set == CAKE_SET_SPARSE_WAIT) {
@@ -2042,24 +2157,25 @@ retry:
if (flow->set == CAKE_SET_SPARSE ||
flow->set == CAKE_SET_SPARSE_WAIT)
b->sparse_flow_count--;
- else if (flow->set == CAKE_SET_BULK)
+ else if (flow->set == CAKE_SET_BULK) {
b->bulk_flow_count--;
- else
+
+ cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
+ } else
b->decaying_flow_count--;
flow->set = CAKE_SET_NONE;
- srchost->srchost_refcnt--;
- dsthost->dsthost_refcnt--;
}
goto begin;
}
+ reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
+ (b->bulk_flow_count *
+ !!(q->rate_flags &
+ CAKE_FLAG_INGRESS)));
/* Last packet in queue may be marked, shouldn't be dropped */
- if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
- (b->bulk_flow_count *
- !!(q->rate_flags &
- CAKE_FLAG_INGRESS))) ||
- !flow->head)
+ if (reason == SKB_NOT_DROPPED_YET || !flow->head)
break;
/* drop this packet, get another one */
@@ -2073,7 +2189,7 @@ retry:
b->tin_dropped++;
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ qdisc_dequeue_drop(sch, skb, reason);
if (q->rate_flags & CAKE_FLAG_INGRESS)
goto retry;
}
@@ -2122,8 +2238,12 @@ retry:
static void cake_reset(struct Qdisc *sch)
{
+ struct cake_sched_data *q = qdisc_priv(sch);
u32 c;
+ if (!q->tins)
+ return;
+
for (c = 0; c < CAKE_MAX_TINS; c++)
cake_clear_tin(sch, c);
}
@@ -2144,6 +2264,8 @@ static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
[TCA_CAKE_MPU] = { .type = NLA_U32 },
[TCA_CAKE_INGRESS] = { .type = NLA_U32 },
[TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+ [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 },
+ [TCA_CAKE_FWMARK] = { .type = NLA_U32 },
};
static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
@@ -2199,8 +2321,7 @@ static int cake_config_besteffort(struct Qdisc *sch)
cake_set_rate(b, rate, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- b->tin_quantum_band = 65535;
- b->tin_quantum_prio = 65535;
+ b->tin_quantum = 65535;
return 0;
}
@@ -2211,8 +2332,7 @@ static int cake_config_precedence(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2225,18 +2345,14 @@ static int cake_config_precedence(struct Qdisc *sch)
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2244,9 +2360,7 @@ static int cake_config_precedence(struct Qdisc *sch)
/* List of known Diffserv codepoints:
*
- * Least Effort (CS1)
- * Best Effort (CS0)
- * Max Reliability & LLT "Lo" (TOS1)
+ * Default Forwarding (DF/CS0) - Best Effort
* Max Throughput (TOS2)
* Min Delay (TOS4)
* LLT "La" (TOS5)
@@ -2254,6 +2368,7 @@ static int cake_config_precedence(struct Qdisc *sch)
* Assured Forwarding 2 (AF2x) - x3
* Assured Forwarding 3 (AF3x) - x3
* Assured Forwarding 4 (AF4x) - x3
+ * Precedence Class 1 (CS1)
* Precedence Class 2 (CS2)
* Precedence Class 3 (CS3)
* Precedence Class 4 (CS4)
@@ -2262,11 +2377,12 @@ static int cake_config_precedence(struct Qdisc *sch)
* Precedence Class 7 (CS7)
* Voice Admit (VA)
* Expedited Forwarding (EF)
-
- * Total 25 codepoints.
+ * Lower Effort (LE)
+ *
+ * Total 26 codepoints.
*/
-/* List of traffic classes in RFC 4594:
+/* List of traffic classes in RFC 4594, updated by RFC 8622:
* (roughly descending order of contended priority)
* (roughly ascending order of uncontended throughput)
*
@@ -2277,12 +2393,12 @@ static int cake_config_precedence(struct Qdisc *sch)
* Realtime Interactive (CS4) - eg. games
* Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
* Broadcast Video (CS3)
- * Low Latency Data (AF2x,TOS4) - eg. database
- * Ops, Admin, Management (CS2,TOS1) - eg. ssh
- * Standard Service (CS0 & unrecognised codepoints)
- * High Throughput Data (AF1x,TOS2) - eg. web traffic
- * Low Priority Data (CS1) - eg. BitTorrent
-
+ * Low-Latency Data (AF2x,TOS4) - eg. database
+ * Ops, Admin, Management (CS2) - eg. ssh
+ * Standard Service (DF & unrecognised codepoints)
+ * High-Throughput Data (AF1x,TOS2) - eg. web traffic
+ * Low-Priority Data (LE,CS1) - eg. BitTorrent
+ *
* Total 12 traffic classes.
*/
@@ -2292,12 +2408,12 @@ static int cake_config_diffserv8(struct Qdisc *sch)
*
* Network Control (CS6, CS7)
* Minimum Latency (EF, VA, CS5, CS4)
- * Interactive Shell (CS2, TOS1)
+ * Interactive Shell (CS2)
* Low Latency Transactions (AF2x, TOS4)
* Video Streaming (AF4x, AF3x, CS3)
- * Bog Standard (CS0 etc.)
- * High Throughput (AF1x, TOS2)
- * Background Traffic (CS1)
+ * Bog Standard (DF etc.)
+ * High Throughput (AF1x, TOS2, CS1)
+ * Background Traffic (LE)
*
* Total 8 traffic classes.
*/
@@ -2305,8 +2421,7 @@ static int cake_config_diffserv8(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2322,18 +2437,14 @@ static int cake_config_diffserv8(struct Qdisc *sch)
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2344,9 +2455,9 @@ static int cake_config_diffserv4(struct Qdisc *sch)
/* Further pruned list of traffic classes for four-class system:
*
* Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
- * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
- * Best Effort (CS0, AF1x, TOS2, and those not specified)
- * Background Traffic (CS1)
+ * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
+ * Best Effort (DF, AF1x, TOS2, and those not specified)
+ * Background Traffic (LE, CS1)
*
* Total 4 traffic classes.
*/
@@ -2372,17 +2483,11 @@ static int cake_config_diffserv4(struct Qdisc *sch)
cake_set_rate(&q->tins[3], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 2;
- q->tins[3].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 1;
- q->tins[3].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 1;
+ q->tins[3].tin_quantum = quantum >> 2;
return 0;
}
@@ -2390,9 +2495,9 @@ static int cake_config_diffserv4(struct Qdisc *sch)
static int cake_config_diffserv3(struct Qdisc *sch)
{
/* Simplified Diffserv structure with 3 tins.
- * Low Priority (CS1)
+ * Latency Sensitive (CS7, CS6, EF, VA, TOS4)
* Best Effort
- * Latency Sensitive (TOS4, VA, EF, CS6, CS7)
+ * Low Priority (LE, CS1)
*/
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
@@ -2413,15 +2518,10 @@ static int cake_config_diffserv3(struct Qdisc *sch)
cake_set_rate(&q->tins[2], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 2;
return 0;
}
@@ -2485,19 +2585,20 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
{
struct cake_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_CAKE_MAX + 1];
+ u16 rate_flags;
+ u8 flow_mode;
int err;
- if (!opt)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
+ extack);
if (err < 0)
return err;
+ flow_mode = q->flow_mode;
if (tb[TCA_CAKE_NAT]) {
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
- q->flow_mode |= CAKE_FLOW_NAT_FLAG *
+ flow_mode &= ~CAKE_FLOW_NAT_FLAG;
+ flow_mode |= CAKE_FLOW_NAT_FLAG *
!!nla_get_u32(tb[TCA_CAKE_NAT]);
#else
NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
@@ -2507,29 +2608,34 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
}
if (tb[TCA_CAKE_BASE_RATE64])
- q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
+ WRITE_ONCE(q->rate_bps,
+ nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
if (tb[TCA_CAKE_DIFFSERV_MODE])
- q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
+ WRITE_ONCE(q->tin_mode,
+ nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
+ rate_flags = q->rate_flags;
if (tb[TCA_CAKE_WASH]) {
if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
- q->rate_flags |= CAKE_FLAG_WASH;
+ rate_flags |= CAKE_FLAG_WASH;
else
- q->rate_flags &= ~CAKE_FLAG_WASH;
+ rate_flags &= ~CAKE_FLAG_WASH;
}
if (tb[TCA_CAKE_FLOW_MODE])
- q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
+ flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
CAKE_FLOW_MASK));
if (tb[TCA_CAKE_ATM])
- q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
+ WRITE_ONCE(q->atm_mode,
+ nla_get_u32(tb[TCA_CAKE_ATM]));
if (tb[TCA_CAKE_OVERHEAD]) {
- q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
- q->rate_flags |= CAKE_FLAG_OVERHEAD;
+ WRITE_ONCE(q->rate_overhead,
+ nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
+ rate_flags |= CAKE_FLAG_OVERHEAD;
q->max_netlen = 0;
q->max_adjlen = 0;
@@ -2538,7 +2644,7 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
}
if (tb[TCA_CAKE_RAW]) {
- q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
+ rate_flags &= ~CAKE_FLAG_OVERHEAD;
q->max_netlen = 0;
q->max_adjlen = 0;
@@ -2547,49 +2653,58 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
}
if (tb[TCA_CAKE_MPU])
- q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
+ WRITE_ONCE(q->rate_mpu,
+ nla_get_u32(tb[TCA_CAKE_MPU]));
if (tb[TCA_CAKE_RTT]) {
- q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
+ u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
- if (!q->interval)
- q->interval = 1;
+ WRITE_ONCE(q->interval, max(interval, 1U));
}
if (tb[TCA_CAKE_TARGET]) {
- q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
+ u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
- if (!q->target)
- q->target = 1;
+ WRITE_ONCE(q->target, max(target, 1U));
}
if (tb[TCA_CAKE_AUTORATE]) {
if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
- q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
+ rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
else
- q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
+ rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
}
if (tb[TCA_CAKE_INGRESS]) {
if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
- q->rate_flags |= CAKE_FLAG_INGRESS;
+ rate_flags |= CAKE_FLAG_INGRESS;
else
- q->rate_flags &= ~CAKE_FLAG_INGRESS;
+ rate_flags &= ~CAKE_FLAG_INGRESS;
}
if (tb[TCA_CAKE_ACK_FILTER])
- q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
+ WRITE_ONCE(q->ack_filter,
+ nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
if (tb[TCA_CAKE_MEMORY])
- q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
+ WRITE_ONCE(q->buffer_config_limit,
+ nla_get_u32(tb[TCA_CAKE_MEMORY]));
if (tb[TCA_CAKE_SPLIT_GSO]) {
if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
- q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ rate_flags |= CAKE_FLAG_SPLIT_GSO;
else
- q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+ rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+ }
+
+ if (tb[TCA_CAKE_FWMARK]) {
+ WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
+ WRITE_ONCE(q->fwmark_shft,
+ q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
}
+ WRITE_ONCE(q->rate_flags, rate_flags);
+ WRITE_ONCE(q->flow_mode, flow_mode);
if (q->tins) {
sch_tree_lock(sch);
cake_reconfigure(sch);
@@ -2615,6 +2730,8 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
int i, j, err;
sch->limit = 10240;
+ sch->flags |= TCQ_F_DEQUEUE_DROPS;
+
q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
q->flow_mode = CAKE_FLOW_TRIPLE;
@@ -2631,7 +2748,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
qdisc_watchdog_init(&q->watchdog, sch);
if (opt) {
- int err = cake_change(sch, opt, extack);
+ err = cake_change(sch, opt, extack);
if (err)
return err;
@@ -2648,7 +2765,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
GFP_KERNEL);
if (!q->tins)
- goto nomem;
+ return -ENOMEM;
for (i = 0; i < CAKE_MAX_TINS; i++) {
struct cake_tin_data *b = q->tins + i;
@@ -2678,75 +2795,78 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
q->min_netlen = ~0;
q->min_adjlen = ~0;
return 0;
-
-nomem:
- cake_destroy(sch);
- return -ENOMEM;
}
static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct cake_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
+ u16 rate_flags;
+ u8 flow_mode;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!opts)
goto nla_put_failure;
- if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
- TCA_CAKE_PAD))
+ if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
+ READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
- q->flow_mode & CAKE_FLOW_MASK))
+ flow_mode = READ_ONCE(q->flow_mode);
+ if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
+ if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
+ if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
+ if (nla_put_u32(skb, TCA_CAKE_MEMORY,
+ READ_ONCE(q->buffer_config_limit)))
goto nla_put_failure;
+ rate_flags = READ_ONCE(q->rate_flags);
if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
- !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
+ !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_CAKE_INGRESS,
- !!(q->rate_flags & CAKE_FLAG_INGRESS)))
+ !!(rate_flags & CAKE_FLAG_INGRESS)))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
+ if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_CAKE_NAT,
- !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
+ !!(flow_mode & CAKE_FLOW_NAT_FLAG)))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
+ if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_CAKE_WASH,
- !!(q->rate_flags & CAKE_FLAG_WASH)))
+ !!(rate_flags & CAKE_FLAG_WASH)))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
+ if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
goto nla_put_failure;
- if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
+ if (!(rate_flags & CAKE_FLAG_OVERHEAD))
if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
+ if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
+ if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
goto nla_put_failure;
if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
- !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
+ !!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -2757,7 +2877,7 @@ nla_put_failure:
static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
- struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
struct cake_sched_data *q = qdisc_priv(sch);
struct nlattr *tstats, *ts;
int i;
@@ -2787,7 +2907,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
#undef PUT_STAT_U32
#undef PUT_STAT_U64
- tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
+ tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
if (!tstats)
goto nla_put_failure;
@@ -2804,7 +2924,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
for (i = 0; i < q->tin_cnt; i++) {
struct cake_tin_data *b = &q->tins[q->tin_order[i]];
- ts = nla_nest_start(d->skb, i + 1);
+ ts = nla_nest_start_noflag(d->skb, i + 1);
if (!ts)
goto nla_put_failure;
@@ -2924,7 +3044,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
if (flow) {
ktime_t now = ktime_get();
- stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
if (!stats)
return -1;
@@ -2945,7 +3065,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
PUT_STAT_S32(BLUE_TIMER_US,
ktime_to_us(
ktime_sub(now,
- flow->cvars.blue_timer)));
+ flow->cvars.blue_timer)));
}
if (flow->cvars.dropping) {
PUT_STAT_S32(DROP_NEXT_US,
@@ -2977,16 +3097,13 @@ static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
struct cake_tin_data *b = &q->tins[q->tin_order[i]];
for (j = 0; j < CAKE_QUEUES; j++) {
- if (list_empty(&b->flows[j].flowchain) ||
- arg->count < arg->skip) {
+ if (list_empty(&b->flows[j].flowchain)) {
arg->count++;
continue;
}
- if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
+ arg))
break;
- }
- arg->count++;
}
}
}
@@ -3017,6 +3134,7 @@ static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
.dump_stats = cake_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("cake");
static int __init cake_module_init(void)
{
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
deleted file mode 100644
index 4dc05409e3fb..000000000000
--- a/net/sched/sch_cbq.c
+++ /dev/null
@@ -1,1807 +0,0 @@
-/*
- * net/sched/sch_cbq.c Class-Based Queueing discipline.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <net/netlink.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-
-
-/* Class-Based Queueing (CBQ) algorithm.
- =======================================
-
- Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
- Management Models for Packet Networks",
- IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
-
- [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
-
- [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
- Parameters", 1996
-
- [4] Sally Floyd and Michael Speer, "Experimental Results
- for Class-Based Queueing", 1998, not published.
-
- -----------------------------------------------------------------------
-
- Algorithm skeleton was taken from NS simulator cbq.cc.
- If someone wants to check this code against the LBL version,
- he should take into account that ONLY the skeleton was borrowed,
- the implementation is different. Particularly:
-
- --- The WRR algorithm is different. Our version looks more
- reasonable (I hope) and works when quanta are allowed to be
- less than MTU, which is always the case when real time classes
- have small rates. Note, that the statement of [3] is
- incomplete, delay may actually be estimated even if class
- per-round allotment is less than MTU. Namely, if per-round
- allotment is W*r_i, and r_1+...+r_k = r < 1
-
- delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
-
- In the worst case we have IntServ estimate with D = W*r+k*MTU
- and C = MTU*r. The proof (if correct at all) is trivial.
-
-
- --- It seems that cbq-2.0 is not very accurate. At least, I cannot
- interpret some places, which look like wrong translations
- from NS. Anyone is advised to find these differences
- and explain to me, why I am wrong 8).
-
- --- Linux has no EOI event, so that we cannot estimate true class
- idle time. Workaround is to consider the next dequeue event
- as sign that previous packet is finished. This is wrong because of
- internal device queueing, but on a permanently loaded link it is true.
- Moreover, combined with clock integrator, this scheme looks
- very close to an ideal solution. */
-
-struct cbq_sched_data;
-
-
-struct cbq_class {
- struct Qdisc_class_common common;
- struct cbq_class *next_alive; /* next class with backlog in this priority band */
-
-/* Parameters */
- unsigned char priority; /* class priority */
- unsigned char priority2; /* priority to be used after overlimit */
- unsigned char ewma_log; /* time constant for idle time calculation */
-
- u32 defmap;
-
- /* Link-sharing scheduler parameters */
- long maxidle; /* Class parameters: see below. */
- long offtime;
- long minidle;
- u32 avpkt;
- struct qdisc_rate_table *R_tab;
-
- /* General scheduler (WRR) parameters */
- long allot;
- long quantum; /* Allotment per WRR round */
- long weight; /* Relative allotment: see below */
-
- struct Qdisc *qdisc; /* Ptr to CBQ discipline */
- struct cbq_class *split; /* Ptr to split node */
- struct cbq_class *share; /* Ptr to LS parent in the class tree */
- struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
- struct cbq_class *borrow; /* NULL if class is bandwidth limited;
- parent otherwise */
- struct cbq_class *sibling; /* Sibling chain */
- struct cbq_class *children; /* Pointer to children chain */
-
- struct Qdisc *q; /* Elementary queueing discipline */
-
-
-/* Variables */
- unsigned char cpriority; /* Effective priority */
- unsigned char delayed;
- unsigned char level; /* level of the class in hierarchy:
- 0 for leaf classes, and maximal
- level of children + 1 for nodes.
- */
-
- psched_time_t last; /* Last end of service */
- psched_time_t undertime;
- long avgidle;
- long deficit; /* Saved deficit for WRR */
- psched_time_t penalized;
- struct gnet_stats_basic_packed bstats;
- struct gnet_stats_queue qstats;
- struct net_rate_estimator __rcu *rate_est;
- struct tc_cbq_xstats xstats;
-
- struct tcf_proto __rcu *filter_list;
- struct tcf_block *block;
-
- int filters;
-
- struct cbq_class *defaults[TC_PRIO_MAX + 1];
-};
-
-struct cbq_sched_data {
- struct Qdisc_class_hash clhash; /* Hash table of all classes */
- int nclasses[TC_CBQ_MAXPRIO + 1];
- unsigned int quanta[TC_CBQ_MAXPRIO + 1];
-
- struct cbq_class link;
-
- unsigned int activemask;
- struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
- with backlog */
-
-#ifdef CONFIG_NET_CLS_ACT
- struct cbq_class *rx_class;
-#endif
- struct cbq_class *tx_class;
- struct cbq_class *tx_borrowed;
- int tx_len;
- psched_time_t now; /* Cached timestamp */
- unsigned int pmask;
-
- struct hrtimer delay_timer;
- struct qdisc_watchdog watchdog; /* Watchdog timer,
- started when CBQ has
- backlog, but cannot
- transmit just now */
- psched_tdiff_t wd_expires;
- int toplevel;
- u32 hgenerator;
-};
-
-
-#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
-
-static inline struct cbq_class *
-cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
-{
- struct Qdisc_class_common *clc;
-
- clc = qdisc_class_find(&q->clhash, classid);
- if (clc == NULL)
- return NULL;
- return container_of(clc, struct cbq_class, common);
-}
-
-#ifdef CONFIG_NET_CLS_ACT
-
-static struct cbq_class *
-cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
-{
- struct cbq_class *cl;
-
- for (cl = this->tparent; cl; cl = cl->tparent) {
- struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
-
- if (new != NULL && new != this)
- return new;
- }
- return NULL;
-}
-
-#endif
-
-/* Classify packet. The procedure is pretty complicated, but
- * it allows us to combine link sharing and priority scheduling
- * transparently.
- *
- * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
- * so that it resolves to split nodes. Then packets are classified
- * by logical priority, or a more specific classifier may be attached
- * to the split node.
- */
-
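Step 1 below depends only on the standard 16-bit major / 16-bit minor layout of TC handles. A small user-space sketch of that check, with invented handle values:

#include <stdint.h>
#include <stdio.h>

#define TC_H_MAJ_MASK 0xFFFF0000U
#define TC_H_MAJ(h)   ((h) & TC_H_MAJ_MASK)

int main(void)
{
	uint32_t sch_handle = 0x00010000;   /* qdisc handle 1:    */
	uint32_t skb_prio   = 0x0001000a;   /* skb->priority 1:10 */

	/* Same major number as the qdisc, so skb->priority directly
	 * names one of this qdisc's classes (step 1 of the classifier).
	 */
	if (TC_H_MAJ(skb_prio ^ sch_handle) == 0)
		printf("priority 0x%08x selects class minor %u\n",
		       skb_prio, skb_prio & 0xFFFFu);
	return 0;
}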
-static struct cbq_class *
-cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *head = &q->link;
- struct cbq_class **defmap;
- struct cbq_class *cl = NULL;
- u32 prio = skb->priority;
- struct tcf_proto *fl;
- struct tcf_result res;
-
- /*
- * Step 1. If skb->priority points to one of our classes, use it.
- */
- if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
- (cl = cbq_class_lookup(q, prio)) != NULL)
- return cl;
-
- *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- for (;;) {
- int result = 0;
- defmap = head->defaults;
-
- fl = rcu_dereference_bh(head->filter_list);
- /*
- * Step 2+n. Apply classifier.
- */
- result = tcf_classify(skb, fl, &res, true);
- if (!fl || result < 0)
- goto fallback;
-
- cl = (void *)res.class;
- if (!cl) {
- if (TC_H_MAJ(res.classid))
- cl = cbq_class_lookup(q, res.classid);
- else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
- cl = defmap[TC_PRIO_BESTEFFORT];
-
- if (cl == NULL)
- goto fallback;
- }
- if (cl->level >= head->level)
- goto fallback;
-#ifdef CONFIG_NET_CLS_ACT
- switch (result) {
- case TC_ACT_QUEUED:
- case TC_ACT_STOLEN:
- case TC_ACT_TRAP:
- *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
- case TC_ACT_SHOT:
- return NULL;
- case TC_ACT_RECLASSIFY:
- return cbq_reclassify(skb, cl);
- }
-#endif
- if (cl->level == 0)
- return cl;
-
- /*
- * Step 3+n. If the classifier selected a link sharing class,
- * apply the agency-specific classifier.
- * Repeat this procedure until we hit a leaf node.
- */
- head = cl;
- }
-
-fallback:
- cl = head;
-
- /*
- * Step 4. No success...
- */
- if (TC_H_MAJ(prio) == 0 &&
- !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
- !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
- return head;
-
- return cl;
-}
-
-/*
- * A packet has just been enqueued on an empty class.
- * cbq_activate_class adds it to the tail of the active class list
- * of its priority band.
- */
-
-static inline void cbq_activate_class(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- int prio = cl->cpriority;
- struct cbq_class *cl_tail;
-
- cl_tail = q->active[prio];
- q->active[prio] = cl;
-
- if (cl_tail != NULL) {
- cl->next_alive = cl_tail->next_alive;
- cl_tail->next_alive = cl;
- } else {
- cl->next_alive = cl;
- q->activemask |= (1<<prio);
- }
-}
-
-/*
- * Unlink class from active chain.
- * Note that this same procedure is done directly in cbq_dequeue*
- * during the round-robin procedure.
- */
-
-static void cbq_deactivate_class(struct cbq_class *this)
-{
- struct cbq_sched_data *q = qdisc_priv(this->qdisc);
- int prio = this->cpriority;
- struct cbq_class *cl;
- struct cbq_class *cl_prev = q->active[prio];
-
- do {
- cl = cl_prev->next_alive;
- if (cl == this) {
- cl_prev->next_alive = cl->next_alive;
- cl->next_alive = NULL;
-
- if (cl == q->active[prio]) {
- q->active[prio] = cl_prev;
- if (cl == q->active[prio]) {
- q->active[prio] = NULL;
- q->activemask &= ~(1<<prio);
- return;
- }
- }
- return;
- }
- } while ((cl_prev = cl) != q->active[prio]);
-}
-
-static void
-cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
-{
- int toplevel = q->toplevel;
-
- if (toplevel > cl->level) {
- psched_time_t now = psched_get_time();
-
- do {
- if (cl->undertime < now) {
- q->toplevel = cl->level;
- return;
- }
- } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
- }
-}
-
-static int
-cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff **to_free)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- int uninitialized_var(ret);
- struct cbq_class *cl = cbq_classify(skb, sch, &ret);
-
-#ifdef CONFIG_NET_CLS_ACT
- q->rx_class = cl;
-#endif
- if (cl == NULL) {
- if (ret & __NET_XMIT_BYPASS)
- qdisc_qstats_drop(sch);
- __qdisc_drop(skb, to_free);
- return ret;
- }
-
- ret = qdisc_enqueue(skb, cl->q, to_free);
- if (ret == NET_XMIT_SUCCESS) {
- sch->q.qlen++;
- cbq_mark_toplevel(q, cl);
- if (!cl->next_alive)
- cbq_activate_class(cl);
- return ret;
- }
-
- if (net_xmit_drop_count(ret)) {
- qdisc_qstats_drop(sch);
- cbq_mark_toplevel(q, cl);
- cl->qstats.drops++;
- }
- return ret;
-}
-
-/* Overlimit action: penalize leaf class by adding offtime */
-static void cbq_overlimit(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- psched_tdiff_t delay = cl->undertime - q->now;
-
- if (!cl->delayed) {
- delay += cl->offtime;
-
- /*
- * The class goes to sleep, so it will have no
- * chance to work on avgidle. Let's forgive it 8)
- *
- * BTW cbq-2.0 has a bug in this
- * place: apparently they forgot to shift it by cl->ewma_log.
- */
- if (cl->avgidle < 0)
- delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
- if (cl->avgidle < cl->minidle)
- cl->avgidle = cl->minidle;
- if (delay <= 0)
- delay = 1;
- cl->undertime = q->now + delay;
-
- cl->xstats.overactions++;
- cl->delayed = 1;
- }
- if (q->wd_expires == 0 || q->wd_expires > delay)
- q->wd_expires = delay;
-
- /* Dirty work! We must schedule wakeups based on
- * real available rate, rather than leaf rate,
- * which may be tiny (even zero).
- */
- if (q->toplevel == TC_CBQ_MAXLEVEL) {
- struct cbq_class *b;
- psched_tdiff_t base_delay = q->wd_expires;
-
- for (b = cl->borrow; b; b = b->borrow) {
- delay = b->undertime - q->now;
- if (delay < base_delay) {
- if (delay <= 0)
- delay = 1;
- base_delay = delay;
- }
- }
-
- q->wd_expires = base_delay;
- }
-}
-
-static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
- psched_time_t now)
-{
- struct cbq_class *cl;
- struct cbq_class *cl_prev = q->active[prio];
- psched_time_t sched = now;
-
- if (cl_prev == NULL)
- return 0;
-
- do {
- cl = cl_prev->next_alive;
- if (now - cl->penalized > 0) {
- cl_prev->next_alive = cl->next_alive;
- cl->next_alive = NULL;
- cl->cpriority = cl->priority;
- cl->delayed = 0;
- cbq_activate_class(cl);
-
- if (cl == q->active[prio]) {
- q->active[prio] = cl_prev;
- if (cl == q->active[prio]) {
- q->active[prio] = NULL;
- return 0;
- }
- }
-
- cl = cl_prev->next_alive;
- } else if (sched - cl->penalized > 0)
- sched = cl->penalized;
- } while ((cl_prev = cl) != q->active[prio]);
-
- return sched - now;
-}
-
-static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
-{
- struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
- delay_timer);
- struct Qdisc *sch = q->watchdog.qdisc;
- psched_time_t now;
- psched_tdiff_t delay = 0;
- unsigned int pmask;
-
- now = psched_get_time();
-
- pmask = q->pmask;
- q->pmask = 0;
-
- while (pmask) {
- int prio = ffz(~pmask);
- psched_tdiff_t tmp;
-
- pmask &= ~(1<<prio);
-
- tmp = cbq_undelay_prio(q, prio, now);
- if (tmp > 0) {
- q->pmask |= 1<<prio;
- if (tmp < delay || delay == 0)
- delay = tmp;
- }
- }
-
- if (delay) {
- ktime_t time;
-
- time = 0;
- time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
- hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
- }
-
- __netif_schedule(qdisc_root(sch));
- return HRTIMER_NORESTART;
-}
-
-/*
- * This is a mission-critical procedure.
- *
- * We "regenerate" the toplevel cutoff if the transmitting class
- * has backlog and is not regulated. This is not part of the
- * original CBQ description, but looks more reasonable.
- * Probably it is wrong; this question needs further investigation.
- */
-
-static inline void
-cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
- struct cbq_class *borrowed)
-{
- if (cl && q->toplevel >= borrowed->level) {
- if (cl->q->q.qlen > 1) {
- do {
- if (borrowed->undertime == PSCHED_PASTPERFECT) {
- q->toplevel = borrowed->level;
- return;
- }
- } while ((borrowed = borrowed->borrow) != NULL);
- }
-#if 0
- /* It is not necessary now. Uncommenting it
- will save CPU cycles, but decrease fairness.
- */
- q->toplevel = TC_CBQ_MAXLEVEL;
-#endif
- }
-}
-
-static void
-cbq_update(struct cbq_sched_data *q)
-{
- struct cbq_class *this = q->tx_class;
- struct cbq_class *cl = this;
- int len = q->tx_len;
- psched_time_t now;
-
- q->tx_class = NULL;
- /* Time integrator. We calculate EOS time
- * by adding expected packet transmission time.
- */
- now = q->now + L2T(&q->link, len);
-
- for ( ; cl; cl = cl->share) {
- long avgidle = cl->avgidle;
- long idle;
-
- cl->bstats.packets++;
- cl->bstats.bytes += len;
-
- /*
- * (now - last) is total time between packet right edges.
- * (last_pktlen/rate) is "virtual" busy time, so that
- *
- * idle = (now - last) - last_pktlen/rate
- */
-
- idle = now - cl->last;
- if ((unsigned long)idle > 128*1024*1024) {
- avgidle = cl->maxidle;
- } else {
- idle -= L2T(cl, len);
-
- /* true_avgidle := (1-W)*true_avgidle + W*idle,
- * where W=2^{-ewma_log}. But cl->avgidle is scaled:
- * cl->avgidle == true_avgidle/W,
- * hence:
- */
- avgidle += idle - (avgidle>>cl->ewma_log);
- }
-
- if (avgidle <= 0) {
- /* Overlimit or at-limit */
-
- if (avgidle < cl->minidle)
- avgidle = cl->minidle;
-
- cl->avgidle = avgidle;
-
- /* Calculate the expected time when this class
- * will be allowed to send.
- * It will occur when:
- * (1-W)*true_avgidle + W*delay = 0, i.e.
- * idle = (1/W - 1)*(-true_avgidle)
- * or
- * idle = (1 - W)*(-cl->avgidle);
- */
- idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
-
- /*
- * That is not all.
- * To maintain the rate allocated to the class,
- * we add to undertime the virtual clock time
- * needed to complete the transmitted packet
- * (len/phys_bandwidth has already elapsed
- * by the moment of cbq_update).
- */
-
- idle -= L2T(&q->link, len);
- idle += L2T(cl, len);
-
- cl->undertime = now + idle;
- } else {
- /* Underlimit */
-
- cl->undertime = PSCHED_PASTPERFECT;
- if (avgidle > cl->maxidle)
- cl->avgidle = cl->maxidle;
- else
- cl->avgidle = avgidle;
- }
- if ((s64)(now - cl->last) > 0)
- cl->last = now;
- }
-
- cbq_update_toplevel(q, this, q->tx_borrowed);
-}
-
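The avgidle update in cbq_update() keeps the EWMA pre-divided by W = 2^-ewma_log, so a single shift replaces the multiply. A standalone sketch (arbitrary sample values, not from the source) showing that the scaled integer form tracks the textbook EWMA up to integer truncation:

#include <stdio.h>

int main(void)
{
	int ewma_log = 5;                  /* W = 2^-5 */
	double W = 1.0 / 32.0;
	long scaled = 32000;               /* plays the role of cl->avgidle == true_avgidle/W */
	double true_avg = scaled * W;
	long idle_samples[] = { 800, 1200, 400, 1500 };

	for (int i = 0; i < 4; i++) {
		long idle = idle_samples[i];

		scaled += idle - (scaled >> ewma_log);        /* kernel form */
		true_avg = (1.0 - W) * true_avg + W * idle;   /* true_avgidle := (1-W)*true_avgidle + W*idle */

		printf("scaled*W = %8.2f   true_avgidle = %8.2f\n",
		       scaled * W, true_avg);
	}
	return 0;
}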
-static inline struct cbq_class *
-cbq_under_limit(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- struct cbq_class *this_cl = cl;
-
- if (cl->tparent == NULL)
- return cl;
-
- if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
- cl->delayed = 0;
- return cl;
- }
-
- do {
- /* This is a very suspicious place. Currently the overlimit
- * action is generated for non-bounded classes
- * only if the link is completely congested.
- * Though this agrees with the ancestor-only paradigm,
- * it looks very stupid. In particular,
- * it means that this chunk of code will either
- * never be called or result in strong amplification
- * of burstiness. Dangerous and silly, but
- * no other solution exists.
- */
- cl = cl->borrow;
- if (!cl) {
- this_cl->qstats.overlimits++;
- cbq_overlimit(this_cl);
- return NULL;
- }
- if (cl->level > q->toplevel)
- return NULL;
- } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
-
- cl->delayed = 0;
- return cl;
-}
-
-static inline struct sk_buff *
-cbq_dequeue_prio(struct Qdisc *sch, int prio)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl_tail, *cl_prev, *cl;
- struct sk_buff *skb;
- int deficit;
-
- cl_tail = cl_prev = q->active[prio];
- cl = cl_prev->next_alive;
-
- do {
- deficit = 0;
-
- /* Start round */
- do {
- struct cbq_class *borrow = cl;
-
- if (cl->q->q.qlen &&
- (borrow = cbq_under_limit(cl)) == NULL)
- goto skip_class;
-
- if (cl->deficit <= 0) {
- /* Class exhausted its allotment per
- * this round. Switch to the next one.
- */
- deficit = 1;
- cl->deficit += cl->quantum;
- goto next_class;
- }
-
- skb = cl->q->dequeue(cl->q);
-
- /* Class did not give us any skb :-(
- * This can occur even if cl->q->q.qlen != 0,
- * e.g. if cl->q == "tbf"
- */
- if (skb == NULL)
- goto skip_class;
-
- cl->deficit -= qdisc_pkt_len(skb);
- q->tx_class = cl;
- q->tx_borrowed = borrow;
- if (borrow != cl) {
-#ifndef CBQ_XSTATS_BORROWS_BYTES
- borrow->xstats.borrows++;
- cl->xstats.borrows++;
-#else
- borrow->xstats.borrows += qdisc_pkt_len(skb);
- cl->xstats.borrows += qdisc_pkt_len(skb);
-#endif
- }
- q->tx_len = qdisc_pkt_len(skb);
-
- if (cl->deficit <= 0) {
- q->active[prio] = cl;
- cl = cl->next_alive;
- cl->deficit += cl->quantum;
- }
- return skb;
-
-skip_class:
- if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
- /* Class is empty or penalized.
- * Unlink it from active chain.
- */
- cl_prev->next_alive = cl->next_alive;
- cl->next_alive = NULL;
-
- /* Did cl_tail point to it? */
- if (cl == cl_tail) {
- /* Repair it! */
- cl_tail = cl_prev;
-
- /* Was it the last class in this band? */
- if (cl == cl_tail) {
- /* Kill the band! */
- q->active[prio] = NULL;
- q->activemask &= ~(1<<prio);
- if (cl->q->q.qlen)
- cbq_activate_class(cl);
- return NULL;
- }
-
- q->active[prio] = cl_tail;
- }
- if (cl->q->q.qlen)
- cbq_activate_class(cl);
-
- cl = cl_prev;
- }
-
-next_class:
- cl_prev = cl;
- cl = cl->next_alive;
- } while (cl_prev != cl_tail);
- } while (deficit);
-
- q->active[prio] = cl_prev;
-
- return NULL;
-}
-
-static inline struct sk_buff *
-cbq_dequeue_1(struct Qdisc *sch)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct sk_buff *skb;
- unsigned int activemask;
-
- activemask = q->activemask & 0xFF;
- while (activemask) {
- int prio = ffz(~activemask);
- activemask &= ~(1<<prio);
- skb = cbq_dequeue_prio(sch, prio);
- if (skb)
- return skb;
- }
- return NULL;
-}
-
-static struct sk_buff *
-cbq_dequeue(struct Qdisc *sch)
-{
- struct sk_buff *skb;
- struct cbq_sched_data *q = qdisc_priv(sch);
- psched_time_t now;
-
- now = psched_get_time();
-
- if (q->tx_class)
- cbq_update(q);
-
- q->now = now;
-
- for (;;) {
- q->wd_expires = 0;
-
- skb = cbq_dequeue_1(sch);
- if (skb) {
- qdisc_bstats_update(sch, skb);
- sch->q.qlen--;
- return skb;
- }
-
- /* All the classes are overlimit.
- *
- * This is possible if:
- *
- * 1. Scheduler is empty.
- * 2. Toplevel cutoff inhibited borrowing.
- * 3. Root class is overlimit.
- *
- * Reset the 2nd and 3rd conditions and retry.
- *
- * Note that NS and cbq-2.0 are buggy: peeking at
- * an arbitrary class is appropriate for ancestor-only
- * sharing, but not for the toplevel algorithm.
- *
- * Our version is better but slower, because it requires
- * two passes; this is unavoidable with top-level sharing.
- */
-
- if (q->toplevel == TC_CBQ_MAXLEVEL &&
- q->link.undertime == PSCHED_PASTPERFECT)
- break;
-
- q->toplevel = TC_CBQ_MAXLEVEL;
- q->link.undertime = PSCHED_PASTPERFECT;
- }
-
- /* No packets in scheduler or nobody wants to give them to us :-(
- * Sigh... start the watchdog timer in the latter case.
- */
-
- if (sch->q.qlen) {
- qdisc_qstats_overlimit(sch);
- if (q->wd_expires)
- qdisc_watchdog_schedule(&q->watchdog,
- now + q->wd_expires);
- }
- return NULL;
-}
-
-/* CBQ class maintenance routines */
-
-static void cbq_adjust_levels(struct cbq_class *this)
-{
- if (this == NULL)
- return;
-
- do {
- int level = 0;
- struct cbq_class *cl;
-
- cl = this->children;
- if (cl) {
- do {
- if (cl->level > level)
- level = cl->level;
- } while ((cl = cl->sibling) != this->children);
- }
- this->level = level + 1;
- } while ((this = this->tparent) != NULL);
-}
-
-static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
-{
- struct cbq_class *cl;
- unsigned int h;
-
- if (q->quanta[prio] == 0)
- return;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- /* BUGGGG... Beware! This expression suffers from
- * arithmetic overflows!
- */
- if (cl->priority == prio) {
- cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
- q->quanta[prio];
- }
- if (cl->quantum <= 0 ||
- cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
- pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
- cl->common.classid, cl->quantum);
- cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
- }
- }
- }
-}
-
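cbq_normalize_quanta() converts relative weights into per-round byte quanta via quantum = weight * allot * nclasses / sum_of_weights, so quanta stay proportional to weight (and, with equal allot values, average out to allot). A small sketch with invented weights:

#include <stdio.h>

int main(void)
{
	long weight[] = { 1, 2, 5 };   /* hypothetical classes in one priority band */
	long allot = 1514;             /* bytes */
	int nclasses = 3;
	long quanta = 0;               /* q->quanta[prio]: sum of weights */

	for (int i = 0; i < nclasses; i++)
		quanta += weight[i];

	for (int i = 0; i < nclasses; i++)
		printf("class %d: weight %ld -> quantum %ld bytes\n",
		       i, weight[i], (weight[i] * allot * nclasses) / quanta);
	return 0;
}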
-static void cbq_sync_defmap(struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
- struct cbq_class *split = cl->split;
- unsigned int h;
- int i;
-
- if (split == NULL)
- return;
-
- for (i = 0; i <= TC_PRIO_MAX; i++) {
- if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
- split->defaults[i] = NULL;
- }
-
- for (i = 0; i <= TC_PRIO_MAX; i++) {
- int level = split->level;
-
- if (split->defaults[i])
- continue;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- struct cbq_class *c;
-
- hlist_for_each_entry(c, &q->clhash.hash[h],
- common.hnode) {
- if (c->split == split && c->level < level &&
- c->defmap & (1<<i)) {
- split->defaults[i] = c;
- level = c->level;
- }
- }
- }
- }
-}
-
-static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
-{
- struct cbq_class *split = NULL;
-
- if (splitid == 0) {
- split = cl->split;
- if (!split)
- return;
- splitid = split->common.classid;
- }
-
- if (split == NULL || split->common.classid != splitid) {
- for (split = cl->tparent; split; split = split->tparent)
- if (split->common.classid == splitid)
- break;
- }
-
- if (split == NULL)
- return;
-
- if (cl->split != split) {
- cl->defmap = 0;
- cbq_sync_defmap(cl);
- cl->split = split;
- cl->defmap = def & mask;
- } else
- cl->defmap = (cl->defmap & ~mask) | (def & mask);
-
- cbq_sync_defmap(cl);
-}
-
-static void cbq_unlink_class(struct cbq_class *this)
-{
- struct cbq_class *cl, **clp;
- struct cbq_sched_data *q = qdisc_priv(this->qdisc);
-
- qdisc_class_hash_remove(&q->clhash, &this->common);
-
- if (this->tparent) {
- clp = &this->sibling;
- cl = *clp;
- do {
- if (cl == this) {
- *clp = cl->sibling;
- break;
- }
- clp = &cl->sibling;
- } while ((cl = *clp) != this->sibling);
-
- if (this->tparent->children == this) {
- this->tparent->children = this->sibling;
- if (this->sibling == this)
- this->tparent->children = NULL;
- }
- } else {
- WARN_ON(this->sibling != this);
- }
-}
-
-static void cbq_link_class(struct cbq_class *this)
-{
- struct cbq_sched_data *q = qdisc_priv(this->qdisc);
- struct cbq_class *parent = this->tparent;
-
- this->sibling = this;
- qdisc_class_hash_insert(&q->clhash, &this->common);
-
- if (parent == NULL)
- return;
-
- if (parent->children == NULL) {
- parent->children = this;
- } else {
- this->sibling = parent->children->sibling;
- parent->children->sibling = this;
- }
-}
-
-static void
-cbq_reset(struct Qdisc *sch)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl;
- int prio;
- unsigned int h;
-
- q->activemask = 0;
- q->pmask = 0;
- q->tx_class = NULL;
- q->tx_borrowed = NULL;
- qdisc_watchdog_cancel(&q->watchdog);
- hrtimer_cancel(&q->delay_timer);
- q->toplevel = TC_CBQ_MAXLEVEL;
- q->now = psched_get_time();
-
- for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
- q->active[prio] = NULL;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- qdisc_reset(cl->q);
-
- cl->next_alive = NULL;
- cl->undertime = PSCHED_PASTPERFECT;
- cl->avgidle = cl->maxidle;
- cl->deficit = cl->quantum;
- cl->cpriority = cl->priority;
- }
- }
- sch->q.qlen = 0;
-}
-
-
-static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
-{
- if (lss->change & TCF_CBQ_LSS_FLAGS) {
- cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
- cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
- }
- if (lss->change & TCF_CBQ_LSS_EWMA)
- cl->ewma_log = lss->ewma_log;
- if (lss->change & TCF_CBQ_LSS_AVPKT)
- cl->avpkt = lss->avpkt;
- if (lss->change & TCF_CBQ_LSS_MINIDLE)
- cl->minidle = -(long)lss->minidle;
- if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
- cl->maxidle = lss->maxidle;
- cl->avgidle = lss->maxidle;
- }
- if (lss->change & TCF_CBQ_LSS_OFFTIME)
- cl->offtime = lss->offtime;
- return 0;
-}
-
-static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
-{
- q->nclasses[cl->priority]--;
- q->quanta[cl->priority] -= cl->weight;
- cbq_normalize_quanta(q, cl->priority);
-}
-
-static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
-{
- q->nclasses[cl->priority]++;
- q->quanta[cl->priority] += cl->weight;
- cbq_normalize_quanta(q, cl->priority);
-}
-
-static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
-{
- struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-
- if (wrr->allot)
- cl->allot = wrr->allot;
- if (wrr->weight)
- cl->weight = wrr->weight;
- if (wrr->priority) {
- cl->priority = wrr->priority - 1;
- cl->cpriority = cl->priority;
- if (cl->priority >= cl->priority2)
- cl->priority2 = TC_CBQ_MAXPRIO - 1;
- }
-
- cbq_addprio(q, cl);
- return 0;
-}
-
-static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
-{
- cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
- return 0;
-}
-
-static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
- [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
- [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
- [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
- [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
- [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
- [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
- [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
-};
-
-static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_CBQ_MAX + 1];
- struct tc_ratespec *r;
- int err;
-
- qdisc_watchdog_init(&q->watchdog, sch);
- hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
- q->delay_timer.function = cbq_undelay;
-
- if (!opt) {
- NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
- return -EINVAL;
- }
-
- err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
- if (err < 0)
- return err;
-
- if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
- NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
- return -EINVAL;
- }
-
- r = nla_data(tb[TCA_CBQ_RATE]);
-
- q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
- if (!q->link.R_tab)
- return -EINVAL;
-
- err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
- if (err)
- goto put_rtab;
-
- err = qdisc_class_hash_init(&q->clhash);
- if (err < 0)
- goto put_block;
-
- q->link.sibling = &q->link;
- q->link.common.classid = sch->handle;
- q->link.qdisc = sch;
- q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- sch->handle, NULL);
- if (!q->link.q)
- q->link.q = &noop_qdisc;
- else
- qdisc_hash_add(q->link.q, true);
-
- q->link.priority = TC_CBQ_MAXPRIO - 1;
- q->link.priority2 = TC_CBQ_MAXPRIO - 1;
- q->link.cpriority = TC_CBQ_MAXPRIO - 1;
- q->link.allot = psched_mtu(qdisc_dev(sch));
- q->link.quantum = q->link.allot;
- q->link.weight = q->link.R_tab->rate.rate;
-
- q->link.ewma_log = TC_CBQ_DEF_EWMA;
- q->link.avpkt = q->link.allot/2;
- q->link.minidle = -0x7FFFFFFF;
-
- q->toplevel = TC_CBQ_MAXLEVEL;
- q->now = psched_get_time();
-
- cbq_link_class(&q->link);
-
- if (tb[TCA_CBQ_LSSOPT])
- cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
-
- cbq_addprio(q, &q->link);
- return 0;
-
-put_block:
- tcf_block_put(q->link.block);
-
-put_rtab:
- qdisc_put_rtab(q->link.R_tab);
- return err;
-}
-
-static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
-
- if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_lssopt opt;
-
- opt.flags = 0;
- if (cl->borrow == NULL)
- opt.flags |= TCF_CBQ_LSS_BOUNDED;
- if (cl->share == NULL)
- opt.flags |= TCF_CBQ_LSS_ISOLATED;
- opt.ewma_log = cl->ewma_log;
- opt.level = cl->level;
- opt.avpkt = cl->avpkt;
- opt.maxidle = cl->maxidle;
- opt.minidle = (u32)(-cl->minidle);
- opt.offtime = cl->offtime;
- opt.change = ~0;
- if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_wrropt opt;
-
- memset(&opt, 0, sizeof(opt));
- opt.flags = 0;
- opt.allot = cl->allot;
- opt.priority = cl->priority + 1;
- opt.cpriority = cl->cpriority + 1;
- opt.weight = cl->weight;
- if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
- goto nla_put_failure;
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
-{
- unsigned char *b = skb_tail_pointer(skb);
- struct tc_cbq_fopt opt;
-
- if (cl->split || cl->defmap) {
- opt.split = cl->split ? cl->split->common.classid : 0;
- opt.defmap = cl->defmap;
- opt.defchange = ~0;
- if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
- goto nla_put_failure;
- }
- return skb->len;
-
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
-{
- if (cbq_dump_lss(skb, cl) < 0 ||
- cbq_dump_rate(skb, cl) < 0 ||
- cbq_dump_wrr(skb, cl) < 0 ||
- cbq_dump_fopt(skb, cl) < 0)
- return -1;
- return 0;
-}
-
-static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct nlattr *nest;
-
- nest = nla_nest_start(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
- if (cbq_dump_attr(skb, &q->link) < 0)
- goto nla_put_failure;
- return nla_nest_end(skb, nest);
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static int
-cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
-
- q->link.xstats.avgidle = q->link.avgidle;
- return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
-}
-
-static int
-cbq_dump_class(struct Qdisc *sch, unsigned long arg,
- struct sk_buff *skb, struct tcmsg *tcm)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
- struct nlattr *nest;
-
- if (cl->tparent)
- tcm->tcm_parent = cl->tparent->common.classid;
- else
- tcm->tcm_parent = TC_H_ROOT;
- tcm->tcm_handle = cl->common.classid;
- tcm->tcm_info = cl->q->handle;
-
- nest = nla_nest_start(skb, TCA_OPTIONS);
- if (nest == NULL)
- goto nla_put_failure;
- if (cbq_dump_attr(skb, cl) < 0)
- goto nla_put_failure;
- return nla_nest_end(skb, nest);
-
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- return -1;
-}
-
-static int
-cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
- struct gnet_dump *d)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- cl->xstats.avgidle = cl->avgidle;
- cl->xstats.undertime = 0;
-
- if (cl->undertime != PSCHED_PASTPERFECT)
- cl->xstats.undertime = cl->undertime - q->now;
-
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
- return -1;
-
- return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
-}
-
-static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
- struct Qdisc **old, struct netlink_ext_ack *extack)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- if (new == NULL) {
- new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- cl->common.classid, extack);
- if (new == NULL)
- return -ENOBUFS;
- }
-
- *old = qdisc_replace(sch, new, &cl->q);
- return 0;
-}
-
-static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- return cl->q;
-}
-
-static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- cbq_deactivate_class(cl);
-}
-
-static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
-
- return (unsigned long)cbq_class_lookup(q, classid);
-}
-
-static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
-
- WARN_ON(cl->filters);
-
- tcf_block_put(cl->block);
- qdisc_put(cl->q);
- qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->rate_est);
- if (cl != &q->link)
- kfree(cl);
-}
-
-static void cbq_destroy(struct Qdisc *sch)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct hlist_node *next;
- struct cbq_class *cl;
- unsigned int h;
-
-#ifdef CONFIG_NET_CLS_ACT
- q->rx_class = NULL;
-#endif
- /*
- * Filters must be destroyed first because we don't destroy the
- * classes from root to leaves, which means that filters can still
- * be bound to classes which have been destroyed already. --TGR '04
- */
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- tcf_block_put(cl->block);
- cl->block = NULL;
- }
- }
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
- common.hnode)
- cbq_destroy_class(sch, cl);
- }
- qdisc_class_hash_destroy(&q->clhash);
-}
-
-static int
-cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
- unsigned long *arg, struct netlink_ext_ack *extack)
-{
- int err;
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)*arg;
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_CBQ_MAX + 1];
- struct cbq_class *parent;
- struct qdisc_rate_table *rtab = NULL;
-
- if (!opt) {
- NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
- return -EINVAL;
- }
-
- err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
- if (err < 0)
- return err;
-
- if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
- NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
- return -EOPNOTSUPP;
- }
-
- if (cl) {
- /* Check parent */
- if (parentid) {
- if (cl->tparent &&
- cl->tparent->common.classid != parentid) {
- NL_SET_ERR_MSG(extack, "Invalid parent id");
- return -EINVAL;
- }
- if (!cl->tparent && parentid != TC_H_ROOT) {
- NL_SET_ERR_MSG(extack, "Parent must be root");
- return -EINVAL;
- }
- }
-
- if (tb[TCA_CBQ_RATE]) {
- rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
- tb[TCA_CBQ_RTAB], extack);
- if (rtab == NULL)
- return -EINVAL;
- }
-
- if (tca[TCA_RATE]) {
- err = gen_replace_estimator(&cl->bstats, NULL,
- &cl->rate_est,
- NULL,
- qdisc_root_sleeping_running(sch),
- tca[TCA_RATE]);
- if (err) {
- NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
- qdisc_put_rtab(rtab);
- return err;
- }
- }
-
- /* Change class parameters */
- sch_tree_lock(sch);
-
- if (cl->next_alive != NULL)
- cbq_deactivate_class(cl);
-
- if (rtab) {
- qdisc_put_rtab(cl->R_tab);
- cl->R_tab = rtab;
- }
-
- if (tb[TCA_CBQ_LSSOPT])
- cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
-
- if (tb[TCA_CBQ_WRROPT]) {
- cbq_rmprio(q, cl);
- cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
- }
-
- if (tb[TCA_CBQ_FOPT])
- cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
-
- if (cl->q->q.qlen)
- cbq_activate_class(cl);
-
- sch_tree_unlock(sch);
-
- return 0;
- }
-
- if (parentid == TC_H_ROOT)
- return -EINVAL;
-
- if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
- NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
- return -EINVAL;
- }
-
- rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
- extack);
- if (rtab == NULL)
- return -EINVAL;
-
- if (classid) {
- err = -EINVAL;
- if (TC_H_MAJ(classid ^ sch->handle) ||
- cbq_class_lookup(q, classid)) {
- NL_SET_ERR_MSG(extack, "Specified class not found");
- goto failure;
- }
- } else {
- int i;
- classid = TC_H_MAKE(sch->handle, 0x8000);
-
- for (i = 0; i < 0x8000; i++) {
- if (++q->hgenerator >= 0x8000)
- q->hgenerator = 1;
- if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
- break;
- }
- err = -ENOSR;
- if (i >= 0x8000) {
- NL_SET_ERR_MSG(extack, "Unable to generate classid");
- goto failure;
- }
- classid = classid|q->hgenerator;
- }
-
- parent = &q->link;
- if (parentid) {
- parent = cbq_class_lookup(q, parentid);
- err = -EINVAL;
- if (!parent) {
- NL_SET_ERR_MSG(extack, "Failed to find parentid");
- goto failure;
- }
- }
-
- err = -ENOBUFS;
- cl = kzalloc(sizeof(*cl), GFP_KERNEL);
- if (cl == NULL)
- goto failure;
-
- err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
- if (err) {
- kfree(cl);
- return err;
- }
-
- if (tca[TCA_RATE]) {
- err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
- NULL,
- qdisc_root_sleeping_running(sch),
- tca[TCA_RATE]);
- if (err) {
- NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
- tcf_block_put(cl->block);
- kfree(cl);
- goto failure;
- }
- }
-
- cl->R_tab = rtab;
- rtab = NULL;
- cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
- NULL);
- if (!cl->q)
- cl->q = &noop_qdisc;
- else
- qdisc_hash_add(cl->q, true);
-
- cl->common.classid = classid;
- cl->tparent = parent;
- cl->qdisc = sch;
- cl->allot = parent->allot;
- cl->quantum = cl->allot;
- cl->weight = cl->R_tab->rate.rate;
-
- sch_tree_lock(sch);
- cbq_link_class(cl);
- cl->borrow = cl->tparent;
- if (cl->tparent != &q->link)
- cl->share = cl->tparent;
- cbq_adjust_levels(parent);
- cl->minidle = -0x7FFFFFFF;
- cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
- cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
- if (cl->ewma_log == 0)
- cl->ewma_log = q->link.ewma_log;
- if (cl->maxidle == 0)
- cl->maxidle = q->link.maxidle;
- if (cl->avpkt == 0)
- cl->avpkt = q->link.avpkt;
- if (tb[TCA_CBQ_FOPT])
- cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
- sch_tree_unlock(sch);
-
- qdisc_class_hash_grow(sch, &q->clhash);
-
- *arg = (unsigned long)cl;
- return 0;
-
-failure:
- qdisc_put_rtab(rtab);
- return err;
-}
-
-static int cbq_delete(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)arg;
- unsigned int qlen, backlog;
-
- if (cl->filters || cl->children || cl == &q->link)
- return -EBUSY;
-
- sch_tree_lock(sch);
-
- qlen = cl->q->q.qlen;
- backlog = cl->q->qstats.backlog;
- qdisc_reset(cl->q);
- qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
-
- if (cl->next_alive)
- cbq_deactivate_class(cl);
-
- if (q->tx_borrowed == cl)
- q->tx_borrowed = q->tx_class;
- if (q->tx_class == cl) {
- q->tx_class = NULL;
- q->tx_borrowed = NULL;
- }
-#ifdef CONFIG_NET_CLS_ACT
- if (q->rx_class == cl)
- q->rx_class = NULL;
-#endif
-
- cbq_unlink_class(cl);
- cbq_adjust_levels(cl->tparent);
- cl->defmap = 0;
- cbq_sync_defmap(cl);
-
- cbq_rmprio(q, cl);
- sch_tree_unlock(sch);
-
- cbq_destroy_class(sch, cl);
- return 0;
-}
-
-static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
- struct netlink_ext_ack *extack)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- if (cl == NULL)
- cl = &q->link;
-
- return cl->block;
-}
-
-static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
- u32 classid)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *p = (struct cbq_class *)parent;
- struct cbq_class *cl = cbq_class_lookup(q, classid);
-
- if (cl) {
- if (p && p->level <= cl->level)
- return 0;
- cl->filters++;
- return (unsigned long)cl;
- }
- return 0;
-}
-
-static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
-{
- struct cbq_class *cl = (struct cbq_class *)arg;
-
- cl->filters--;
-}
-
-static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
-{
- struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl;
- unsigned int h;
-
- if (arg->stop)
- return;
-
- for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
- arg->stop = 1;
- return;
- }
- arg->count++;
- }
- }
-}
-
-static const struct Qdisc_class_ops cbq_class_ops = {
- .graft = cbq_graft,
- .leaf = cbq_leaf,
- .qlen_notify = cbq_qlen_notify,
- .find = cbq_find,
- .change = cbq_change_class,
- .delete = cbq_delete,
- .walk = cbq_walk,
- .tcf_block = cbq_tcf_block,
- .bind_tcf = cbq_bind_filter,
- .unbind_tcf = cbq_unbind_filter,
- .dump = cbq_dump_class,
- .dump_stats = cbq_dump_class_stats,
-};
-
-static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
- .next = NULL,
- .cl_ops = &cbq_class_ops,
- .id = "cbq",
- .priv_size = sizeof(struct cbq_sched_data),
- .enqueue = cbq_enqueue,
- .dequeue = cbq_dequeue,
- .peek = qdisc_peek_dequeued,
- .init = cbq_init,
- .reset = cbq_reset,
- .destroy = cbq_destroy,
- .change = NULL,
- .dump = cbq_dump,
- .dump_stats = cbq_dump_stats,
- .owner = THIS_MODULE,
-};
-
-static int __init cbq_module_init(void)
-{
- return register_qdisc(&cbq_qdisc_ops);
-}
-static void __exit cbq_module_exit(void)
-{
- unregister_qdisc(&cbq_qdisc_ops);
-}
-module_init(cbq_module_init)
-module_exit(cbq_module_exit)
-MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index c6a502933fe7..8c9a0400c862 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_cbs.c Credit Based Shaper
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com>
- *
*/
/* Credit Based Shaper (CBS)
@@ -55,22 +50,27 @@
* locredit = max_frame_size * (sendslope / port_transmit_rate)
*/
+#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <linux/units.h>
+
+#include <net/netevent.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
-#define BYTES_PER_KBIT (1000LL / 8)
+static LIST_HEAD(cbs_list);
+static DEFINE_SPINLOCK(cbs_list_lock);
struct cbs_sched_data {
bool offload;
int queue;
- s64 port_rate; /* in bytes/s */
+ atomic64_t port_rate; /* in bytes/s */
s64 last; /* timestamp in ns */
s64 credits; /* in bytes */
s32 locredit; /* in bytes */
@@ -82,6 +82,7 @@ struct cbs_sched_data {
struct sk_buff **to_free);
struct sk_buff *(*dequeue)(struct Qdisc *sch);
struct Qdisc *qdisc;
+ struct list_head cbs_list;
};
static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -181,6 +182,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
s64 credits;
int len;
+ /* The previous packet is still being sent */
+ if (now < q->last) {
+ qdisc_watchdog_schedule_ns(&q->watchdog, q->last);
+ return NULL;
+ }
if (q->credits < 0) {
credits = timediff_to_credits(now - q->last, q->idleslope);
@@ -207,11 +213,17 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
/* As sendslope is a negative number, this will decrease the
* amount of q->credits.
*/
- credits = credits_from_len(len, q->sendslope, q->port_rate);
+ credits = credits_from_len(len, q->sendslope,
+ atomic64_read(&q->port_rate));
credits += q->credits;
q->credits = max_t(s64, credits, q->locredit);
- q->last = now;
+ /* Estimate of the transmission of the last byte of the packet in ns */
+ if (unlikely(atomic64_read(&q->port_rate) == 0))
+ q->last = now;
+ else
+ q->last = now + div64_s64(len * NSEC_PER_SEC,
+ atomic64_read(&q->port_rate));
return skb;
}
@@ -294,6 +306,58 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
return 0;
}
+static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q)
+{
+ struct ethtool_link_ksettings ecmd;
+ int speed = SPEED_10;
+ s64 port_rate;
+ int err;
+
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
+
+skip:
+ port_rate = speed * 1000 * BYTES_PER_KBIT;
+
+ atomic64_set(&q->port_rate, port_rate);
+ netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n",
+ dev->name, (long long)atomic64_read(&q->port_rate),
+ ecmd.base.speed);
+}
+
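With BYTES_PER_KBIT = 1000/8 = 125 (the local define removed above, presumably now supplied by linux/units.h), the computation in cbs_set_port_rate() and the per-packet wire-time estimate in cbs_dequeue_soft() reduce to simple arithmetic. A user-space sketch assuming a 1 Gbit/s link speed:

#include <stdio.h>

#define BYTES_PER_KBIT (1000LL / 8)

int main(void)
{
	long long speed = 1000;                                /* Mbit/s, e.g. SPEED_1000 */
	long long port_rate = speed * 1000 * BYTES_PER_KBIT;   /* 125000000 bytes/s */
	long long len = 1500;                                  /* frame length, bytes */

	/* same shape as the q->last estimate: time to put len bytes on the wire */
	long long tx_ns = len * 1000000000LL / port_rate;      /* 12000 ns */

	printf("port_rate = %lld bytes/s, %lld-byte frame ~ %lld ns\n",
	       port_rate, len, tx_ns);
	return 0;
}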
+static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct cbs_sched_data *q;
+ struct net_device *qdev;
+ bool found = false;
+
+ ASSERT_RTNL();
+
+ if (event != NETDEV_UP && event != NETDEV_CHANGE)
+ return NOTIFY_DONE;
+
+ spin_lock(&cbs_list_lock);
+ list_for_each_entry(q, &cbs_list, cbs_list) {
+ qdev = qdisc_dev(q->qdisc);
+ if (qdev == dev) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&cbs_list_lock);
+
+ if (found)
+ cbs_set_port_rate(dev, q);
+
+ return NOTIFY_DONE;
+}
+
static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -303,7 +367,8 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
struct tc_cbs_qopt *qopt;
int err;
- err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_CBS_MAX, opt, cbs_policy,
+ extack);
if (err < 0)
return err;
@@ -315,16 +380,7 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
qopt = nla_data(tb[TCA_CBS_PARMS]);
if (!qopt->offload) {
- struct ethtool_link_ksettings ecmd;
- s64 link_speed;
-
- if (!__ethtool_get_link_ksettings(dev, &ecmd))
- link_speed = ecmd.base.speed;
- else
- link_speed = SPEED_1000;
-
- q->port_rate = link_speed * 1000 * BYTES_PER_KBIT;
-
+ cbs_set_port_rate(dev, q);
cbs_disable_offload(dev, q);
} else {
err = cbs_enable_offload(dev, q, qopt, extack);
@@ -333,11 +389,11 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
}
/* Everything went OK, save the parameters used. */
- q->hicredit = qopt->hicredit;
- q->locredit = qopt->locredit;
- q->idleslope = qopt->idleslope * BYTES_PER_KBIT;
- q->sendslope = qopt->sendslope * BYTES_PER_KBIT;
- q->offload = qopt->offload;
+ WRITE_ONCE(q->hicredit, qopt->hicredit);
+ WRITE_ONCE(q->locredit, qopt->locredit);
+ WRITE_ONCE(q->idleslope, qopt->idleslope * BYTES_PER_KBIT);
+ WRITE_ONCE(q->sendslope, qopt->sendslope * BYTES_PER_KBIT);
+ WRITE_ONCE(q->offload, qopt->offload);
return 0;
}
@@ -358,6 +414,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
if (!q->qdisc)
return -ENOMEM;
+ spin_lock(&cbs_list_lock);
+ list_add(&q->cbs_list, &cbs_list);
+ spin_unlock(&cbs_list_lock);
+
qdisc_hash_add(q->qdisc, false);
q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
@@ -375,12 +435,18 @@ static void cbs_destroy(struct Qdisc *sch)
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- qdisc_watchdog_cancel(&q->watchdog);
+ /* Nothing to do if we couldn't create the underlying qdisc */
+ if (!q->qdisc)
+ return;
+ qdisc_watchdog_cancel(&q->watchdog);
cbs_disable_offload(dev, q);
- if (q->qdisc)
- qdisc_put(q->qdisc);
+ spin_lock(&cbs_list_lock);
+ list_del(&q->cbs_list);
+ spin_unlock(&cbs_list_lock);
+
+ qdisc_put(q->qdisc);
}
static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -389,15 +455,15 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_cbs_qopt opt = { };
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
- opt.hicredit = q->hicredit;
- opt.locredit = q->locredit;
- opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT);
- opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT);
- opt.offload = q->offload;
+ opt.hicredit = READ_ONCE(q->hicredit);
+ opt.locredit = READ_ONCE(q->locredit);
+ opt.sendslope = div64_s64(READ_ONCE(q->sendslope), BYTES_PER_KBIT);
+ opt.idleslope = div64_s64(READ_ONCE(q->idleslope), BYTES_PER_KBIT);
+ opt.offload = READ_ONCE(q->offload);
if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -454,13 +520,7 @@ static unsigned long cbs_find(struct Qdisc *sch, u32 classid)
static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
- if (walker->count >= walker->skip) {
- if (walker->fn(sch, 1, walker) < 0) {
- walker->stop = 1;
- return;
- }
- }
- walker->count++;
+ tc_qdisc_stats_dump(sch, 1, walker);
}
}
@@ -486,16 +546,33 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
.dump = cbs_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("cbs");
+
+static struct notifier_block cbs_device_notifier = {
+ .notifier_call = cbs_dev_notifier,
+};
static int __init cbs_module_init(void)
{
- return register_qdisc(&cbs_qdisc_ops);
+ int err;
+
+ err = register_netdevice_notifier(&cbs_device_notifier);
+ if (err)
+ return err;
+
+ err = register_qdisc(&cbs_qdisc_ops);
+ if (err)
+ unregister_netdevice_notifier(&cbs_device_notifier);
+
+ return err;
}
static void __exit cbs_module_exit(void)
{
unregister_qdisc(&cbs_qdisc_ops);
+ unregister_netdevice_notifier(&cbs_device_notifier);
}
module_init(cbs_module_init)
module_exit(cbs_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Credit Based shaper");
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index eafc0d17d174..59e7bdf5063e 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_choke.c CHOKE scheduler
*
* Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
*/
#include <linux/module.h>
@@ -64,7 +60,6 @@ struct choke_sched_data {
u32 forced_drop; /* Forced drops, qavg > max_thresh */
u32 forced_mark; /* Forced marks, qavg > max_thresh */
u32 pdrop; /* Drops due to queue limits */
- u32 other; /* Drops due to drop() calls */
u32 matched; /* Drops to flow match */
} stats;
@@ -128,14 +123,13 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
if (idx == q->tail)
choke_zap_tail_holes(q);
+ --sch->q.qlen;
qdisc_qstats_backlog_dec(sch, skb);
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
qdisc_drop(skb, sch, to_free);
- --sch->q.qlen;
}
struct choke_skb_cb {
- u16 classid;
u8 keys_valid;
struct flow_keys_digest keys;
};
@@ -146,11 +140,6 @@ static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}
-static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
-{
- choke_skb_cb(skb)->classid = classid;
-}
-
/*
* Compare flow of two packets
* Returns true only if source and destination address and port match.
@@ -194,7 +183,7 @@ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
int retrys = 3;
do {
- *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
+ *pidx = (q->head + get_random_u32_below(choke_len(q))) & q->tab_mask;
skb = q->tab[*pidx];
if (skb)
return skb;
@@ -325,9 +314,8 @@ static void choke_reset(struct Qdisc *sch)
rtnl_qdisc_drop(skb, sch);
}
- sch->q.qlen = 0;
- sch->qstats.backlog = 0;
- memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
+ if (q->tab)
+ memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
q->head = q->tail = 0;
red_restart(&q->vars);
}
@@ -354,11 +342,13 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
struct sk_buff **old = NULL;
unsigned int mask;
u32 max_P;
+ u8 *stab;
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
+ choke_policy, NULL);
if (err < 0)
return err;
@@ -366,11 +356,11 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
tb[TCA_CHOKE_STAB] == NULL)
return -EINVAL;
- max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
+ max_P = nla_get_u32_default(tb[TCA_CHOKE_MAX_P], 0);
ctl = nla_data(tb[TCA_CHOKE_PARMS]);
-
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+ stab = nla_data(tb[TCA_CHOKE_STAB]);
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
return -EINVAL;
if (ctl->limit > CHOKE_MAX_QUEUE)
@@ -380,7 +370,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
if (mask != q->tab_mask) {
struct sk_buff **ntab;
- ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
+ ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
if (!ntab)
return -ENOMEM;
@@ -415,12 +405,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
} else
sch_tree_lock(sch);
- q->flags = ctl->flags;
- q->limit = ctl->limit;
+ WRITE_ONCE(q->flags, ctl->flags);
+ WRITE_ONCE(q->limit, ctl->limit);
red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
ctl->Plog, ctl->Scell_log,
- nla_data(tb[TCA_CHOKE_STAB]),
+ stab,
max_P);
red_set_vars(&q->vars);
@@ -441,23 +431,24 @@ static int choke_init(struct Qdisc *sch, struct nlattr *opt,
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct choke_sched_data *q = qdisc_priv(sch);
+ u8 Wlog = READ_ONCE(q->parms.Wlog);
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
- .limit = q->limit,
- .flags = q->flags,
- .qth_min = q->parms.qth_min >> q->parms.Wlog,
- .qth_max = q->parms.qth_max >> q->parms.Wlog,
- .Wlog = q->parms.Wlog,
- .Plog = q->parms.Plog,
- .Scell_log = q->parms.Scell_log,
+ .limit = READ_ONCE(q->limit),
+ .flags = READ_ONCE(q->flags),
+ .qth_min = READ_ONCE(q->parms.qth_min) >> Wlog,
+ .qth_max = READ_ONCE(q->parms.qth_max) >> Wlog,
+ .Wlog = Wlog,
+ .Plog = READ_ONCE(q->parms.Plog),
+ .Scell_log = READ_ONCE(q->parms.Scell_log),
};
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
- nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
+ nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P)))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -473,7 +464,6 @@ static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.early = q->stats.prob_drop + q->stats.forced_drop,
.marked = q->stats.prob_mark + q->stats.forced_mark,
.pdrop = q->stats.pdrop,
- .other = q->stats.other,
.matched = q->stats.matched,
};
@@ -509,6 +499,7 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
.dump_stats = choke_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("choke");
static int __init choke_module_init(void)
{
@@ -524,3 +515,4 @@ module_init(choke_module_init)
module_exit(choke_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Choose and keep responsive flows scheduler");
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 17cd81f84b5d..c6551578f1cf 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
* Codel - The Controlled-Delay Active Queue Management algorithm
*
@@ -7,37 +8,6 @@
* Implemented on linux by :
* Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
* Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The names of the authors may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, provided that this notice is retained in full, this
- * software may be distributed under the terms of the GNU General
- * Public License ("GPL") version 2, in which case the provisions of the
- * GPL apply INSTEAD OF those given above.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- * DAMAGE.
- *
*/
#include <linux/module.h>
@@ -71,10 +41,10 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
struct Qdisc *sch = ctx;
struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
- if (skb)
+ if (skb) {
sch->qstats.backlog -= qdisc_pkt_len(skb);
-
- prefetch(&skb->end); /* we'll need skb_shinfo() */
+ prefetch(&skb->end); /* we'll need skb_shinfo() */
+ }
return skb;
}
@@ -82,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- kfree_skb(skb);
+ qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED);
qdisc_qstats_drop(sch);
}
@@ -95,10 +65,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
&q->stats, qdisc_pkt_len, codel_get_enqueue_time,
drop_func, dequeue_func);
- /* We can't call qdisc_tree_reduce_backlog() if our qlen is 0,
- * or HTB crashes. Defer it for next round.
- */
- if (q->stats.drop_count && sch->q.qlen) {
+ if (q->stats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
q->stats.drop_count = 0;
q->stats.drop_len = 0;
@@ -119,7 +86,8 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
q = qdisc_priv(sch);
q->drop_overlimit++;
- return qdisc_drop(skb, sch, to_free);
+ return qdisc_drop_reason(skb, sch, to_free,
+ SKB_DROP_REASON_QDISC_OVERLIMIT);
}
static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
@@ -133,15 +101,13 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
static int codel_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct codel_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_CODEL_MAX + 1];
- unsigned int qlen, dropped = 0;
int err;
- if (!opt)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt,
+ codel_policy, NULL);
if (err < 0)
return err;
@@ -150,36 +116,43 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_CODEL_TARGET]) {
u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
- q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ WRITE_ONCE(q->params.target,
+ ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT);
}
if (tb[TCA_CODEL_CE_THRESHOLD]) {
u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]);
- q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+ WRITE_ONCE(q->params.ce_threshold,
+ (val * NSEC_PER_USEC) >> CODEL_SHIFT);
}
if (tb[TCA_CODEL_INTERVAL]) {
u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
- q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ WRITE_ONCE(q->params.interval,
+ ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT);
}
if (tb[TCA_CODEL_LIMIT])
- sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+ WRITE_ONCE(sch->limit,
+ nla_get_u32(tb[TCA_CODEL_LIMIT]));
if (tb[TCA_CODEL_ECN])
- q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);
+ WRITE_ONCE(q->params.ecn,
+ !!nla_get_u32(tb[TCA_CODEL_ECN]));
- qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
- dropped += qdisc_pkt_len(skb);
- qdisc_qstats_backlog_dec(sch, skb);
+ if (!skb)
+ break;
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_qdisc_drop(skb, sch);
}
- qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
@@ -209,30 +182,34 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt,
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
+ sch->flags |= TCQ_F_DEQUEUE_DROPS;
+
return 0;
}
static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct codel_sched_data *q = qdisc_priv(sch);
+ codel_time_t ce_threshold;
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_CODEL_TARGET,
- codel_time_to_us(q->params.target)) ||
+ codel_time_to_us(READ_ONCE(q->params.target))) ||
nla_put_u32(skb, TCA_CODEL_LIMIT,
- sch->limit) ||
+ READ_ONCE(sch->limit)) ||
nla_put_u32(skb, TCA_CODEL_INTERVAL,
- codel_time_to_us(q->params.interval)) ||
+ codel_time_to_us(READ_ONCE(q->params.interval))) ||
nla_put_u32(skb, TCA_CODEL_ECN,
- q->params.ecn))
+ READ_ONCE(q->params.ecn)))
goto nla_put_failure;
- if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD &&
+ ce_threshold = READ_ONCE(q->params.ce_threshold);
+ if (ce_threshold != CODEL_DISABLED_THRESHOLD &&
nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD,
- codel_time_to_us(q->params.ce_threshold)))
+ codel_time_to_us(ce_threshold)))
goto nla_put_failure;
return nla_nest_end(skb, opts);
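The parameter updates in codel_change() are now done with WRITE_ONCE() so that the dump path above can read them with READ_ONCE() without holding the qdisc lock. A minimal sketch of the pairing (illustrative only; new_target is a placeholder, not a variable from this patch):

    /* writer side (codel_change), serialized by sch_tree_lock() */
    WRITE_ONCE(q->params.target, new_target);

    /* reader side (codel_dump), may run without the qdisc lock */
    u32 target = READ_ONCE(q->params.target);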
@@ -289,6 +266,7 @@ static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
.dump_stats = codel_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("codel");
static int __init codel_module_init(void)
{
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 09b800991065..9b6d79bd8737 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_drr.c Deficit Round Robin scheduler
*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -20,9 +17,8 @@
struct drr_class {
struct Qdisc_class_common common;
- unsigned int filter_cnt;
- struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_basic_sync bstats;
struct gnet_stats_queue qstats;
struct net_rate_estimator __rcu *rate_est;
struct list_head alist;
@@ -39,6 +35,11 @@ struct drr_sched {
struct Qdisc_class_hash clhash;
};
+static bool cl_is_active(struct drr_class *cl)
+{
+ return !list_empty(&cl->alist);
+}
+
static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
{
struct drr_sched *q = qdisc_priv(sch);
@@ -50,15 +51,6 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
return container_of(clc, struct drr_class, common);
}
-static void drr_purge_queue(struct drr_class *cl)
-{
- unsigned int len = cl->qdisc->q.qlen;
- unsigned int backlog = cl->qdisc->qstats.backlog;
-
- qdisc_reset(cl->qdisc);
- qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
-}
-
static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
[TCA_DRR_QUANTUM] = { .type = NLA_U32 },
};
@@ -79,7 +71,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_DRR_MAX, opt, drr_policy,
+ extack);
if (err < 0)
return err;
@@ -96,8 +89,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
- NULL,
- qdisc_root_sleeping_running(sch),
+ NULL, true,
tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to replace estimator");
@@ -117,6 +109,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl == NULL)
return -ENOBUFS;
+ gnet_stats_basic_sync_init(&cl->bstats);
+ INIT_LIST_HEAD(&cl->alist);
cl->common.classid = classid;
cl->quantum = quantum;
cl->qdisc = qdisc_create_dflt(sch->dev_queue,
@@ -129,9 +123,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
- NULL,
- qdisc_root_sleeping_running(sch),
- tca[TCA_RATE]);
+ NULL, true, tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to replace estimator");
qdisc_put(cl->qdisc);
@@ -157,17 +149,20 @@ static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
kfree(cl);
}
-static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
+static int drr_delete_class(struct Qdisc *sch, unsigned long arg,
+ struct netlink_ext_ack *extack)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl = (struct drr_class *)arg;
- if (cl->filter_cnt > 0)
+ if (qdisc_class_in_use(&cl->common)) {
+ NL_SET_ERR_MSG(extack, "DRR class is in use");
return -EBUSY;
+ }
sch_tree_lock(sch);
- drr_purge_queue(cl);
+ qdisc_purge_queue(cl->qdisc);
qdisc_class_hash_remove(&q->clhash, &cl->common);
sch_tree_unlock(sch);
@@ -199,8 +194,8 @@ static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
{
struct drr_class *cl = drr_find_class(sch, classid);
- if (cl != NULL)
- cl->filter_cnt++;
+ if (cl)
+ qdisc_class_get(&cl->common);
return (unsigned long)cl;
}
@@ -209,7 +204,7 @@ static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
- cl->filter_cnt--;
+ qdisc_class_put(&cl->common);
}
static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
@@ -240,7 +235,7 @@ static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
}
static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
@@ -253,7 +248,7 @@ static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
tcm->tcm_handle = cl->common.classid;
tcm->tcm_info = cl->qdisc->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
@@ -269,17 +264,17 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct drr_class *cl = (struct drr_class *)arg;
- __u32 qlen = cl->qdisc->q.qlen;
+ __u32 qlen = qdisc_qlen_sum(cl->qdisc);
+ struct Qdisc *cl_q = cl->qdisc;
struct tc_drr_stats xstats;
memset(&xstats, 0, sizeof(xstats));
if (qlen)
xstats.deficit = cl->deficit;
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
+ gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
@@ -296,15 +291,8 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
return;
- }
- arg->count++;
}
}
}
@@ -326,7 +314,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
fl = rcu_dereference_bh(q->filter_list);
- result = tcf_classify(skb, fl, &res, false);
+ result = tcf_classify(skb, NULL, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -334,7 +322,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -354,7 +342,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
int err = 0;
- bool first;
cl = drr_classify(skb, sch, &err);
if (cl == NULL) {
@@ -364,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return err;
}
- first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
@@ -374,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return err;
}
- if (first) {
+ if (!cl_is_active(cl)) {
list_add_tail(&cl->alist, &q->active);
cl->deficit = cl->quantum;
}
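The enqueue hunk above now keys class activation on cl_is_active(), i.e. on actual membership in the active list, instead of on a snapshot of the child qlen; this is also why the surrounding hunks convert list_del() to list_del_init(), which leaves the node self-linked so that list_empty() stays a valid "is it queued?" test. A generic sketch of the idiom (illustrative only, using the kernel list helpers):

    #include <linux/list.h>

    struct item {
            struct list_head node;          /* initialized with INIT_LIST_HEAD() */
    };

    static void item_deactivate(struct item *it)
    {
            list_del_init(&it->node);       /* node now points back to itself */
    }

    static bool item_is_active(struct item *it)
    {
            return !list_empty(&it->node);  /* true only while the node sits on a list */
    }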
@@ -408,7 +394,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
if (unlikely(skb == NULL))
goto out;
if (cl->qdisc->q.qlen == 0)
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
bstats_update(&cl->bstats, skb);
qdisc_bstats_update(sch, skb);
@@ -449,12 +435,10 @@ static void drr_reset_qdisc(struct Qdisc *sch)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (cl->qdisc->q.qlen)
- list_del(&cl->alist);
+ list_del_init(&cl->alist);
qdisc_reset(cl->qdisc);
}
}
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
}
static void drr_destroy_qdisc(struct Qdisc *sch)
@@ -501,6 +485,7 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
.destroy = drr_destroy_qdisc,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("drr");
static int __init drr_init(void)
{
@@ -515,3 +500,4 @@ static void __exit drr_exit(void)
module_init(drr_init);
module_exit(drr_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Deficit Round Robin scheduler");
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
deleted file mode 100644
index 42471464ded3..000000000000
--- a/net/sched/sch_dsmark.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/* net/sched/sch_dsmark.c - Differentiated Services field marker */
-
-/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
-
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
-#include <linux/bitops.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <asm/byteorder.h>
-
-/*
- * classid class marking
- * ------- ----- -------
- * n/a 0 n/a
- * x:0 1 use entry [0]
- * ... ... ...
- * x:y y>0 y+1 use entry [y]
- * ... ... ...
- * x:indices-1 indices use entry [indices-1]
- * ... ... ...
- * x:y y+1 use entry [y & (indices-1)]
- * ... ... ...
- * 0xffff 0x10000 use entry [indices-1]
- */
-
-
-#define NO_DEFAULT_INDEX (1 << 16)
-
-struct mask_value {
- u8 mask;
- u8 value;
-};
-
-struct dsmark_qdisc_data {
- struct Qdisc *q;
- struct tcf_proto __rcu *filter_list;
- struct tcf_block *block;
- struct mask_value *mv;
- u16 indices;
- u8 set_tc_index;
- u32 default_index; /* index range is 0...0xffff */
-#define DSMARK_EMBEDDED_SZ 16
- struct mask_value embedded[DSMARK_EMBEDDED_SZ];
-};
-
-static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
-{
- return index <= p->indices && index > 0;
-}
-
-/* ------------------------- Class/flow operations ------------------------- */
-
-static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
- struct Qdisc *new, struct Qdisc **old,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
- __func__, sch, p, new, old);
-
- if (new == NULL) {
- new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- sch->handle, NULL);
- if (new == NULL)
- new = &noop_qdisc;
- }
-
- *old = qdisc_replace(sch, new, &p->q);
- return 0;
-}
-
-static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- return p->q;
-}
-
-static unsigned long dsmark_find(struct Qdisc *sch, u32 classid)
-{
- return TC_H_MIN(classid) + 1;
-}
-
-static unsigned long dsmark_bind_filter(struct Qdisc *sch,
- unsigned long parent, u32 classid)
-{
- pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
- __func__, sch, qdisc_priv(sch), classid);
-
- return dsmark_find(sch, classid);
-}
-
-static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl)
-{
-}
-
-static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
- [TCA_DSMARK_INDICES] = { .type = NLA_U16 },
- [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 },
- [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG },
- [TCA_DSMARK_MASK] = { .type = NLA_U8 },
- [TCA_DSMARK_VALUE] = { .type = NLA_U8 },
-};
-
-static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
- struct nlattr **tca, unsigned long *arg,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *opt = tca[TCA_OPTIONS];
- struct nlattr *tb[TCA_DSMARK_MAX + 1];
- int err = -EINVAL;
-
- pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
- __func__, sch, p, classid, parent, *arg);
-
- if (!dsmark_valid_index(p, *arg)) {
- err = -ENOENT;
- goto errout;
- }
-
- if (!opt)
- goto errout;
-
- err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL);
- if (err < 0)
- goto errout;
-
- if (tb[TCA_DSMARK_VALUE])
- p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]);
-
- if (tb[TCA_DSMARK_MASK])
- p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
-
- err = 0;
-
-errout:
- return err;
-}
-
-static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- if (!dsmark_valid_index(p, arg))
- return -EINVAL;
-
- p->mv[arg - 1].mask = 0xff;
- p->mv[arg - 1].value = 0;
-
- return 0;
-}
-
-static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- int i;
-
- pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
- __func__, sch, p, walker);
-
- if (walker->stop)
- return;
-
- for (i = 0; i < p->indices; i++) {
- if (p->mv[i].mask == 0xff && !p->mv[i].value)
- goto ignore;
- if (walker->count >= walker->skip) {
- if (walker->fn(sch, i + 1, walker) < 0) {
- walker->stop = 1;
- break;
- }
- }
-ignore:
- walker->count++;
- }
-}
-
-static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- return p->block;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
-
-static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
- struct sk_buff **to_free)
-{
- unsigned int len = qdisc_pkt_len(skb);
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- int err;
-
- pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
-
- if (p->set_tc_index) {
- int wlen = skb_network_offset(skb);
-
- switch (tc_skb_protocol(skb)) {
- case htons(ETH_P_IP):
- wlen += sizeof(struct iphdr);
- if (!pskb_may_pull(skb, wlen) ||
- skb_try_make_writable(skb, wlen))
- goto drop;
-
- skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
- & ~INET_ECN_MASK;
- break;
-
- case htons(ETH_P_IPV6):
- wlen += sizeof(struct ipv6hdr);
- if (!pskb_may_pull(skb, wlen) ||
- skb_try_make_writable(skb, wlen))
- goto drop;
-
- skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
- & ~INET_ECN_MASK;
- break;
- default:
- skb->tc_index = 0;
- break;
- }
- }
-
- if (TC_H_MAJ(skb->priority) == sch->handle)
- skb->tc_index = TC_H_MIN(skb->priority);
- else {
- struct tcf_result res;
- struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
- int result = tcf_classify(skb, fl, &res, false);
-
- pr_debug("result %d class 0x%04x\n", result, res.classid);
-
- switch (result) {
-#ifdef CONFIG_NET_CLS_ACT
- case TC_ACT_QUEUED:
- case TC_ACT_STOLEN:
- case TC_ACT_TRAP:
- __qdisc_drop(skb, to_free);
- return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-
- case TC_ACT_SHOT:
- goto drop;
-#endif
- case TC_ACT_OK:
- skb->tc_index = TC_H_MIN(res.classid);
- break;
-
- default:
- if (p->default_index != NO_DEFAULT_INDEX)
- skb->tc_index = p->default_index;
- break;
- }
- }
-
- err = qdisc_enqueue(skb, p->q, to_free);
- if (err != NET_XMIT_SUCCESS) {
- if (net_xmit_drop_count(err))
- qdisc_qstats_drop(sch);
- return err;
- }
-
- sch->qstats.backlog += len;
- sch->q.qlen++;
-
- return NET_XMIT_SUCCESS;
-
-drop:
- qdisc_drop(skb, sch, to_free);
- return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-}
-
-static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct sk_buff *skb;
- u32 index;
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
- skb = qdisc_dequeue_peeked(p->q);
- if (skb == NULL)
- return NULL;
-
- qdisc_bstats_update(sch, skb);
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
-
- index = skb->tc_index & (p->indices - 1);
- pr_debug("index %d->%d\n", skb->tc_index, index);
-
- switch (tc_skb_protocol(skb)) {
- case htons(ETH_P_IP):
- ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask,
- p->mv[index].value);
- break;
- case htons(ETH_P_IPV6):
- ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask,
- p->mv[index].value);
- break;
- default:
- /*
- * Only complain if a change was actually attempted.
- * This way, we can send non-IP traffic through dsmark
- * and don't need yet another qdisc as a bypass.
- */
- if (p->mv[index].mask != 0xff || p->mv[index].value)
- pr_warn("%s: unsupported protocol %d\n",
- __func__, ntohs(tc_skb_protocol(skb)));
- break;
- }
-
- return skb;
-}
-
-static struct sk_buff *dsmark_peek(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
- return p->q->ops->peek(p->q);
-}
-
-static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *tb[TCA_DSMARK_MAX + 1];
- int err = -EINVAL;
- u32 default_index = NO_DEFAULT_INDEX;
- u16 indices;
- int i;
-
- pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
-
- if (!opt)
- goto errout;
-
- err = tcf_block_get(&p->block, &p->filter_list, sch, extack);
- if (err)
- return err;
-
- err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL);
- if (err < 0)
- goto errout;
-
- err = -EINVAL;
- indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
-
- if (hweight32(indices) != 1)
- goto errout;
-
- if (tb[TCA_DSMARK_DEFAULT_INDEX])
- default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
-
- if (indices <= DSMARK_EMBEDDED_SZ)
- p->mv = p->embedded;
- else
- p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL);
- if (!p->mv) {
- err = -ENOMEM;
- goto errout;
- }
- for (i = 0; i < indices; i++) {
- p->mv[i].mask = 0xff;
- p->mv[i].value = 0;
- }
- p->indices = indices;
- p->default_index = default_index;
- p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
-
- p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle,
- NULL);
- if (p->q == NULL)
- p->q = &noop_qdisc;
- else
- qdisc_hash_add(p->q, true);
-
- pr_debug("%s: qdisc %p\n", __func__, p->q);
-
- err = 0;
-errout:
- return err;
-}
-
-static void dsmark_reset(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
- qdisc_reset(p->q);
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
-}
-
-static void dsmark_destroy(struct Qdisc *sch)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
- pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
- tcf_block_put(p->block);
- qdisc_put(p->q);
- if (p->mv != p->embedded)
- kfree(p->mv);
-}
-
-static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
- struct sk_buff *skb, struct tcmsg *tcm)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *opts = NULL;
-
- pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
-
- if (!dsmark_valid_index(p, cl))
- return -EINVAL;
-
- tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
- tcm->tcm_info = p->q->handle;
-
- opts = nla_nest_start(skb, TCA_OPTIONS);
- if (opts == NULL)
- goto nla_put_failure;
- if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) ||
- nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value))
- goto nla_put_failure;
-
- return nla_nest_end(skb, opts);
-
-nla_put_failure:
- nla_nest_cancel(skb, opts);
- return -EMSGSIZE;
-}
-
-static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- struct dsmark_qdisc_data *p = qdisc_priv(sch);
- struct nlattr *opts = NULL;
-
- opts = nla_nest_start(skb, TCA_OPTIONS);
- if (opts == NULL)
- goto nla_put_failure;
- if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
- goto nla_put_failure;
-
- if (p->default_index != NO_DEFAULT_INDEX &&
- nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
- goto nla_put_failure;
-
- if (p->set_tc_index &&
- nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
- goto nla_put_failure;
-
- return nla_nest_end(skb, opts);
-
-nla_put_failure:
- nla_nest_cancel(skb, opts);
- return -EMSGSIZE;
-}
-
-static const struct Qdisc_class_ops dsmark_class_ops = {
- .graft = dsmark_graft,
- .leaf = dsmark_leaf,
- .find = dsmark_find,
- .change = dsmark_change,
- .delete = dsmark_delete,
- .walk = dsmark_walk,
- .tcf_block = dsmark_tcf_block,
- .bind_tcf = dsmark_bind_filter,
- .unbind_tcf = dsmark_unbind_filter,
- .dump = dsmark_dump_class,
-};
-
-static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
- .next = NULL,
- .cl_ops = &dsmark_class_ops,
- .id = "dsmark",
- .priv_size = sizeof(struct dsmark_qdisc_data),
- .enqueue = dsmark_enqueue,
- .dequeue = dsmark_dequeue,
- .peek = dsmark_peek,
- .init = dsmark_init,
- .reset = dsmark_reset,
- .destroy = dsmark_destroy,
- .change = NULL,
- .dump = dsmark_dump,
- .owner = THIS_MODULE,
-};
-
-static int __init dsmark_module_init(void)
-{
- return register_qdisc(&dsmark_qdisc_ops);
-}
-
-static void __exit dsmark_module_exit(void)
-{
- unregister_qdisc(&dsmark_qdisc_ops);
-}
-
-module_init(dsmark_module_init)
-module_exit(dsmark_module_exit)
-
-MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
new file mode 100644
index 000000000000..6d7e6389758d
--- /dev/null
+++ b/net/sched/sch_dualpi2.c
@@ -0,0 +1,1177 @@
+// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+/* Copyright (C) 2024 Nokia
+ *
+ * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
+ * Author: Olga Albisser <olga@albisser.org>
+ * Author: Henrik Steen <henrist@henrist.net>
+ * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
+ * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
+ *
+ * DualPI Improved with a Square (dualpi2):
+ * - Supports congestion controls that comply with the Prague requirements
+ * in RFC9331 (e.g. TCP-Prague)
+ * - Supports coupled dual-queue with PI2 as defined in RFC9332
+ * - Supports ECN L4S-identifier (IP.ECN==0b*1)
+ *
+ * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
+ * they do not meet the 'Prague L4S Requirements' listed in RFC 9331
+ * Section 4, so they can only be used with DualPI2 in a datacenter
+ * context.
+ *
+ * References:
+ * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
+ * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and
+ * scalable TCP." in proc. ACM CoNEXT'16, 2016.
+ */
+
+#include <linux/errno.h>
+#include <linux/hrtimer.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/gso.h>
+#include <net/inet_ecn.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+
+/* 32b enables supporting flows with windows up to ~8.6 * 1e9 packets,
+ * i.e., twice the maximal snd_cwnd.
+ * MAX_PROB must be consistent with the RNG in dualpi2_roll().
+ */
+#define MAX_PROB U32_MAX
+
+/* alpha/beta values exchanged over netlink are in units of 256ns */
+#define ALPHA_BETA_SHIFT 8
+
+/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
+ * computations. Consequently (see dualpi2_scale_alpha_beta()), their
+ * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
+ * (~4MHz) as those are given in 1/256th. This enables tuning alpha/beta to
+ * control flows whose maximal RTTs can range from usecs up to a few secs.
+ */
+#define ALPHA_BETA_MAX ((1U << 31) - 1)
+
+/* Internal alpha/beta are in units of 64ns.
+ * This allows using all alpha/beta values in the allowed range without loss
+ * of precision due to rounding when scaling them internally, e.g.,
+ * scale_alpha_beta(1) will not round down to 0.
+ */
+#define ALPHA_BETA_GRANULARITY 6
+
+#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
+
+/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
+#define MAX_WC 100
+
+struct dualpi2_sched_data {
+ struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */
+ struct Qdisc *sch; /* The Classic queue (C-queue) */
+
+ /* Registered tc filters */
+ struct tcf_proto __rcu *tcf_filters;
+ struct tcf_block *tcf_block;
+
+ /* PI2 parameters */
+ u64 pi2_target; /* Target delay in nanoseconds */
+ u32 pi2_tupdate; /* Timer frequency in nanoseconds */
+ u32 pi2_prob; /* Base PI probability */
+ u32 pi2_alpha; /* Gain factor for the integral rate response */
+ u32 pi2_beta; /* Gain factor for the proportional response */
+ struct hrtimer pi2_timer; /* prob update timer */
+
+ /* Step AQM (L-queue only) parameters */
+ u32 step_thresh; /* Step threshold */
+ bool step_in_packets; /* Step thresh in packets (1) or time (0) */
+
+ /* C-queue starvation protection */
+ s32 c_protection_credit; /* Credit (sign indicates which queue) */
+ s32 c_protection_init; /* Reset value of the credit */
+ u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
+ u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */
+
+ /* General dualQ parameters */
+ u32 memory_limit; /* Memory limit of both queues */
+ u8 coupling_factor;/* Coupling factor (k) between both queues */
+ u8 ecn_mask; /* Mask to match packets into L-queue */
+ u32 min_qlen_step; /* Minimum queue length to apply step thresh */
+ bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */
+ bool drop_overload; /* Drop (1) on overload, or overflow (0) */
+ bool split_gso; /* Split aggregated skb (1) or leave as is (0) */
+
+ /* Statistics */
+ u64 c_head_ts; /* Enqueue timestamp of the C-queue head */
+ u64 l_head_ts; /* Enqueue timestamp of the L-queue head */
+ u64 last_qdelay; /* Q delay val at the last probability update */
+ u32 packets_in_c; /* Enqueue packet counter of the C-queue */
+ u32 packets_in_l; /* Enqueue packet counter of the L-queue */
+ u32 maxq; /* Maximum queue size of the C-queue */
+ u32 ecn_mark; /* ECN mark pkt counter due to PI probability */
+ u32 step_marks; /* ECN mark pkt counter due to step AQM */
+ u32 memory_used; /* Memory used of both queues */
+ u32 max_memory_used;/* Maximum used memory */
+
+ /* Deferred drop statistics */
+ u32 deferred_drops_cnt; /* Packets dropped */
+ u32 deferred_drops_len; /* Bytes dropped */
+};
+
+struct dualpi2_skb_cb {
+ u64 ts; /* Timestamp at enqueue */
+ u8 apply_step:1, /* Can we apply the step threshold */
+ classified:2, /* Packet classification results */
+ ect:2; /* Packet ECT codepoint */
+};
+
+enum dualpi2_classification_results {
+ DUALPI2_C_CLASSIC = 0, /* C-queue */
+ DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */
+ DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */
+	__DUALPI2_C_MAX		/* Keep last */
+};
+
+static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb));
+ return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference)
+{
+ return reference - dualpi2_skb_cb(skb)->ts;
+}
+
+static u64 head_enqueue_time(struct Qdisc *q)
+{
+ struct sk_buff *skb = qdisc_peek_head(q);
+
+ return skb ? dualpi2_skb_cb(skb)->ts : 0;
+}
+
+static u32 dualpi2_scale_alpha_beta(u32 param)
+{
+ u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
+
+ do_div(tmp, NSEC_PER_SEC);
+ return tmp;
+}
+
+static u32 dualpi2_unscale_alpha_beta(u32 param)
+{
+ u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING);
+
+ do_div(tmp, MAX_PROB);
+ return tmp;
+}
+
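A worked example of dualpi2_scale_alpha_beta() for the default alpha of 41 (about 0.16 Hz), assuming ALPHA_BETA_SCALING = 8 - 6 = 2; the numbers below are illustrative arithmetic, not code from the patch:

    scaled = ((u64)41 * MAX_PROB >> 2) / NSEC_PER_SEC
           = (41 * 4294967295 / 4) / 1000000000
           ~= 44        /* the internal value later stored in q->pi2_alpha */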
+static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
+{
+ return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
+}
+
+static bool skb_is_l4s(struct sk_buff *skb)
+{
+ return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S;
+}
+
+static bool skb_in_l_queue(struct sk_buff *skb)
+{
+ return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC;
+}
+
+static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q)
+{
+ return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step;
+}
+
+static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb)
+{
+ if (INET_ECN_set_ce(skb)) {
+ q->ecn_mark++;
+ return true;
+ }
+ return false;
+}
+
+static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
+{
+ q->c_protection_credit = q->c_protection_init;
+}
+
+/* This computes the initial credit value and WRR weight for the L queue (wl)
+ * from the weight of the C queue (wc).
+ * If wl > wc, the scheduler will start with the L queue when reset.
+ */
+static void dualpi2_calculate_c_protection(struct Qdisc *sch,
+ struct dualpi2_sched_data *q, u32 wc)
+{
+ q->c_protection_wc = wc;
+ q->c_protection_wl = MAX_WC - wc;
+ q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
+ ((int)q->c_protection_wc - (int)q->c_protection_wl);
+ dualpi2_reset_c_protection(q);
+}
+
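A worked example of the resulting credit, assuming the default weights wc = 10 / wl = 90 and an Ethernet-sized psched_mtu() of about 1514 bytes:

    c_protection_init = 1514 * (10 - 90) = -121120

The negative initial credit means the L-queue is served first once both queues are backlogged, which is what the comment above describes for wl > wc.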
+static bool dualpi2_roll(u32 prob)
+{
+ return get_random_u32() <= prob;
+}
+
+/* Packets in the C-queue are subject to a marking probability pC, which is the
+ * square of the internal PI probability (i.e., have an overall lower mark/drop
+ * probability). If the qdisc is overloaded, ignore ECT values and only drop.
+ *
+ * Note that this marking scheme is also applied to L4S packets during overload.
+ * Return true if packet dropping is required in C queue
+ */
+static bool dualpi2_classic_marking(struct dualpi2_sched_data *q,
+ struct sk_buff *skb, u32 prob,
+ bool overload)
+{
+ if (dualpi2_roll(prob) && dualpi2_roll(prob)) {
+ if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
+ return true;
+ dualpi2_mark(q, skb);
+ }
+ return false;
+}
+
+/* Packets in the L-queue are subject to a marking probability pL given by the
+ * internal PI probability scaled by the coupling factor.
+ *
+ * On overload (i.e., @local_l_prob is >= 100%):
+ * - if the qdisc is configured to trade losses to preserve latency (i.e.,
+ * @q->drop_overload), apply classic drops first before marking.
+ * - otherwise, preserve the "no loss" property of ECN at the cost of queueing
+ * delay, eventually resulting in taildrop behavior once sch->limit is
+ * reached.
+ * Return true if packet dropping is required in L queue
+ */
+static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q,
+ struct sk_buff *skb,
+ u64 local_l_prob, u32 prob,
+ bool overload)
+{
+ if (overload) {
+ /* Apply classic drop */
+ if (!q->drop_overload ||
+ !(dualpi2_roll(prob) && dualpi2_roll(prob)))
+ goto mark;
+ return true;
+ }
+
+ /* We can safely cut the upper 32b as overload==false */
+ if (dualpi2_roll(local_l_prob)) {
+ /* Non-ECT packets could have classified as L4S by filters. */
+ if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
+ return true;
+mark:
+ dualpi2_mark(q, skb);
+ }
+ return false;
+}
+
+/* Decide whether a given packet must be dropped (or marked if ECT), according
+ * to the PI2 probability.
+ *
+ * Never mark/drop if we have a standing queue of less than 2 MTUs.
+ */
+static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q,
+ struct sk_buff *skb)
+{
+ u64 local_l_prob;
+ bool overload;
+ u32 prob;
+
+ if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch)))
+ return false;
+
+ prob = READ_ONCE(q->pi2_prob);
+ local_l_prob = (u64)prob * q->coupling_factor;
+ overload = local_l_prob > MAX_PROB;
+
+ switch (dualpi2_skb_cb(skb)->classified) {
+ case DUALPI2_C_CLASSIC:
+ return dualpi2_classic_marking(q, skb, prob, overload);
+ case DUALPI2_C_L4S:
+ return dualpi2_scalable_marking(q, skb, local_l_prob, prob,
+ overload);
+ default: /* DUALPI2_C_LLLL */
+ return false;
+ }
+}
+
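Putting the two marking paths together, with p the internal PI probability (expressed as a fraction of MAX_PROB) and k the coupling factor (default 2), the effective per-packet behaviour is approximately:

    p_C = p * p                 /* C-queue: two successful rolls, squared probability */
    p_L = k * p                 /* L-queue: coupled scalable marking */
    overload once k * p > 100%  /* i.e. p > 50% for the default k = 2 */

For example, at p = 10% a Classic packet is marked or dropped with probability 1%, while an L4S packet is marked with probability 20%.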
+static void dualpi2_read_ect(struct sk_buff *skb)
+{
+ struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
+ int wlen = skb_network_offset(skb);
+
+ switch (skb_protocol(skb, true)) {
+ case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ goto not_ecn;
+
+ cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK;
+ break;
+ case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ goto not_ecn;
+
+ cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK;
+ break;
+ default:
+ goto not_ecn;
+ }
+ return;
+
+not_ecn:
+	/* Non-pullable/writable packets can only be dropped, hence they are
+	 * classified as not ECT.
+ */
+ cb->ect = INET_ECN_NOT_ECT;
+}
+
+static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ dualpi2_read_ect(skb);
+ if (cb->ect & q->ecn_mask) {
+ cb->classified = DUALPI2_C_L4S;
+ return NET_XMIT_SUCCESS;
+ }
+
+ if (TC_H_MAJ(skb->priority) == q->sch->handle &&
+ TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) {
+ cb->classified = TC_H_MIN(skb->priority);
+ return NET_XMIT_SUCCESS;
+ }
+
+ fl = rcu_dereference_bh(q->tcf_filters);
+ if (!fl) {
+ cb->classified = DUALPI2_C_CLASSIC;
+ return NET_XMIT_SUCCESS;
+ }
+
+ result = tcf_classify(skb, NULL, fl, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ }
+#endif
+ cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ?
+ TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC;
+ }
+ return NET_XMIT_SUCCESS;
+}
+
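Besides the ECN-mask and tc-filter paths, the classifier also honours skb->priority when its major number matches the qdisc handle, so an application can pin traffic to one of the three classes directly. A userspace sketch (illustrative: the 8000: handle is assumed for the example, and priorities above 6 normally require CAP_NET_ADMIN):

    #include <sys/socket.h>

    /* Steer a socket into the L-queue of a dualpi2 qdisc attached with handle 8000: */
    static int set_l4s_priority(int fd)
    {
            unsigned int prio = 0x80000001; /* major 0x8000 = qdisc handle, minor 1 = DUALPI2_C_L4S */

            return setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
    }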
+static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct dualpi2_skb_cb *cb;
+
+ if (unlikely(qdisc_qlen(sch) >= sch->limit) ||
+ unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) {
+ qdisc_qstats_overlimit(sch);
+ if (skb_in_l_queue(skb))
+ qdisc_qstats_overlimit(q->l_queue);
+ return qdisc_drop_reason(skb, sch, to_free,
+ SKB_DROP_REASON_QDISC_OVERLIMIT);
+ }
+
+ if (q->drop_early && must_drop(sch, q, skb)) {
+ qdisc_drop_reason(skb, sch, to_free,
+ SKB_DROP_REASON_QDISC_CONGESTED);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ }
+
+ cb = dualpi2_skb_cb(skb);
+ cb->ts = ktime_get_ns();
+ q->memory_used += skb->truesize;
+ if (q->memory_used > q->max_memory_used)
+ q->max_memory_used = q->memory_used;
+
+ if (qdisc_qlen(sch) > q->maxq)
+ q->maxq = qdisc_qlen(sch);
+
+ if (skb_in_l_queue(skb)) {
+ /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */
+ dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q);
+
+ /* Keep the overall qdisc stats consistent */
+ ++sch->q.qlen;
+ qdisc_qstats_backlog_inc(sch, skb);
+ ++q->packets_in_l;
+ if (!q->l_head_ts)
+ q->l_head_ts = cb->ts;
+ return qdisc_enqueue_tail(skb, q->l_queue);
+ }
+ ++q->packets_in_c;
+ if (!q->c_head_ts)
+ q->c_head_ts = cb->ts;
+ return qdisc_enqueue_tail(skb, sch);
+}
+
+/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue
+ * each of those individually. This yields the following benefits, at the
+ * expense of CPU usage:
+ * - Finer-grained AQM actions as the sub-packets of a burst no longer share the
+ * same fate (e.g., the random mark/drop probability is applied individually)
+ * - Improved precision of the starvation protection/WRR scheduler at dequeue,
+ * as the size of the dequeued packets will be smaller.
+ */
+static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ err = dualpi2_skb_classify(q, skb);
+ if (err != NET_XMIT_SUCCESS) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ if (q->split_gso && skb_is_gso(skb)) {
+ netdev_features_t features;
+ struct sk_buff *nskb, *next;
+ int cnt, byte_len, orig_len;
+ int err;
+
+ features = netif_skb_features(skb);
+ nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(nskb))
+ return qdisc_drop(skb, sch, to_free);
+
+ cnt = 1;
+ byte_len = 0;
+ orig_len = qdisc_pkt_len(skb);
+ skb_list_walk_safe(nskb, nskb, next) {
+ skb_mark_not_on_list(nskb);
+
+ /* Iterate through GSO fragments of an skb:
+ * (1) Set pkt_len from the single GSO fragments
+ * (2) Copy classified and ect values of an skb
+ * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
+ */
+ qdisc_skb_cb(nskb)->pkt_len = nskb->len;
+ qdisc_skb_cb(nskb)->pkt_segs = 1;
+ dualpi2_skb_cb(nskb)->classified =
+ dualpi2_skb_cb(skb)->classified;
+ dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
+ err = dualpi2_enqueue_skb(nskb, sch, to_free);
+
+ if (err == NET_XMIT_SUCCESS) {
+ /* Compute the backlog adjustment that needs
+ * to be propagated in the qdisc tree to reflect
+ * all new skbs successfully enqueued.
+ */
+ ++cnt;
+ byte_len += nskb->len;
+ }
+ }
+ if (cnt > 1) {
+ /* The caller will add the original skb stats to its
+ * backlog, compensate this if any nskb is enqueued.
+ */
+ --cnt;
+ byte_len -= orig_len;
+ }
+ qdisc_tree_reduce_backlog(sch, -cnt, -byte_len);
+ consume_skb(skb);
+ return err;
+ }
+ return dualpi2_enqueue_skb(skb, sch, to_free);
+}
+
+/* Select the queue from which the next packet can be dequeued, ensuring that
+ * neither queue can starve the other with a WRR scheduler.
+ *
+ * The sign of the WRR credit determines the next queue, while the size of
+ * the dequeued packet determines the magnitude of the WRR credit change. If
+ * either queue is empty, the WRR credit is kept unchanged.
+ *
+ * As the dequeued packet can be dropped later, the caller has to perform the
+ * qdisc_bstats_update() calls.
+ */
+static struct sk_buff *dequeue_packet(struct Qdisc *sch,
+ struct dualpi2_sched_data *q,
+ int *credit_change,
+ u64 now)
+{
+ struct sk_buff *skb = NULL;
+ int c_len;
+
+ *credit_change = 0;
+ c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue);
+ if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) {
+ skb = __qdisc_dequeue_head(&q->l_queue->q);
+ WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue));
+ if (c_len)
+ *credit_change = q->c_protection_wc;
+ qdisc_qstats_backlog_dec(q->l_queue, skb);
+
+ /* Keep the global queue size consistent */
+ --sch->q.qlen;
+ q->memory_used -= skb->truesize;
+ } else if (c_len) {
+ skb = __qdisc_dequeue_head(&sch->q);
+ WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch));
+ if (qdisc_qlen(q->l_queue))
+ *credit_change = ~((s32)q->c_protection_wl) + 1;
+ q->memory_used -= skb->truesize;
+ } else {
+ dualpi2_reset_c_protection(q);
+ return NULL;
+ }
+ *credit_change *= qdisc_pkt_len(skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ return skb;
+}
+
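As a worked example of the credit arithmetic with the default wc = 10 / wl = 90 and 1500-byte packets:

    L-queue dequeue: credit += wc * len = 10 * 1500 = +15000
    C-queue dequeue: credit -= wl * len = 90 * 1500 = -135000

When both queues stay backlogged these changes balance out around zero, so the served byte ratio approaches wl:wc, i.e. roughly 90:10 in favour of the L-queue with the defaults, while still guaranteeing the C-queue its share.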
+static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb,
+ u64 now)
+{
+ u64 qdelay = 0;
+
+ if (q->step_in_packets)
+ qdelay = qdisc_qlen(q->l_queue);
+ else
+ qdelay = dualpi2_sojourn_time(skb, now);
+
+ if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) {
+ if (!dualpi2_skb_cb(skb)->ect) {
+ /* Drop this non-ECT packet */
+ return 1;
+ }
+
+ if (dualpi2_mark(q, skb))
+ ++q->step_marks;
+ }
+ qdisc_bstats_update(q->l_queue, skb);
+ return 0;
+}
+
+static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
+ struct Qdisc *sch, enum skb_drop_reason reason)
+{
+ ++q->deferred_drops_cnt;
+ q->deferred_drops_len += qdisc_pkt_len(skb);
+ kfree_skb_reason(skb, reason);
+ qdisc_qstats_drop(sch);
+}
+
+static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ int credit_change;
+ u64 now;
+
+ now = ktime_get_ns();
+
+ while ((skb = dequeue_packet(sch, q, &credit_change, now))) {
+ if (!q->drop_early && must_drop(sch, q, skb)) {
+ drop_and_retry(q, skb, sch,
+ SKB_DROP_REASON_QDISC_CONGESTED);
+ continue;
+ }
+
+ if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) {
+ qdisc_qstats_drop(q->l_queue);
+ drop_and_retry(q, skb, sch,
+ SKB_DROP_REASON_DUALPI2_STEP_DROP);
+ continue;
+ }
+
+ q->c_protection_credit += credit_change;
+ qdisc_bstats_update(sch, skb);
+ break;
+ }
+
+ if (q->deferred_drops_cnt) {
+ qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
+ q->deferred_drops_len);
+ q->deferred_drops_cnt = 0;
+ q->deferred_drops_len = 0;
+ }
+ return skb;
+}
+
+static s64 __scale_delta(u64 diff)
+{
+ do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
+ return diff;
+}
+
+static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
+ u64 *qdelay_l)
+{
+ u64 now, qc, ql;
+
+ now = ktime_get_ns();
+ qc = READ_ONCE(q->c_head_ts);
+ ql = READ_ONCE(q->l_head_ts);
+
+ *qdelay_c = qc ? now - qc : 0;
+ *qdelay_l = ql ? now - ql : 0;
+}
+
+static u32 calculate_probability(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ u32 new_prob;
+ u64 qdelay_c;
+ u64 qdelay_l;
+ u64 qdelay;
+ s64 delta;
+
+ get_queue_delays(q, &qdelay_c, &qdelay_l);
+ qdelay = max(qdelay_l, qdelay_c);
+
+	/* Alpha and beta take at most 32b, i.e., the delay difference would
+ * overflow for queuing delay differences > ~4.2sec.
+ */
+ delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha;
+ delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta;
+ q->last_qdelay = qdelay;
+
+ /* Bound new_prob between 0 and MAX_PROB */
+ if (delta > 0) {
+ new_prob = __scale_delta(delta) + q->pi2_prob;
+ if (new_prob < q->pi2_prob)
+ new_prob = MAX_PROB;
+ } else {
+ new_prob = q->pi2_prob - __scale_delta(~delta + 1);
+ if (new_prob > q->pi2_prob)
+ new_prob = 0;
+ }
+
+ /* If we do not drop on overload, ensure we cap the L4S probability to
+ * 100% to keep window fairness when overflowing.
+ */
+ if (!q->drop_overload)
+ return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
+ return new_prob;
+}
+
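calculate_probability() is the PI core of the qdisc: every tupdate interval (default 16 ms) it nudges the probability according to how far the larger of the two queue delays sits from the target (default 15 ms) and how fast that delay is changing. Schematically, in the qdisc's fixed-point units:

    /* schematic form of the update performed above, clamped to [0, MAX_PROB] */
    qdelay  = max(qdelay_c, qdelay_l);
    p      += alpha * (qdelay - target) + beta * (qdelay - qdelay_prev);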
+static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
+{
+	/* Apply a rule of thumb, i.e., doubling the packet length,
+	 * to further include per-packet overhead in memory_limit.
+ */
+ u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
+
+ if (upper_32_bits(memlim))
+ return U32_MAX;
+ else
+ return lower_32_bits(memlim);
+}
+
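With the default limit of 10000 packets and an assumed psched_mtu() of 1514 bytes, the derived memory budget is:

    memory_limit = 10000 * 2 * 1514 = 30280000 bytes   (~30 MB)

Results that would not fit in 32 bits are clamped to U32_MAX.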
+static u32 convert_us_to_nsec(u32 us)
+{
+ u64 ns = mul_u32_u32(us, NSEC_PER_USEC);
+
+ if (upper_32_bits(ns))
+ return U32_MAX;
+
+ return lower_32_bits(ns);
+}
+
+static u32 convert_ns_to_usec(u64 ns)
+{
+ do_div(ns, NSEC_PER_USEC);
+ if (upper_32_bits(ns))
+ return U32_MAX;
+
+ return lower_32_bits(ns);
+}
+
+static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
+{
+ struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock; /* to lock qdisc for probability calculations */
+
+ rcu_read_lock();
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spin_lock(root_lock);
+
+ WRITE_ONCE(q->pi2_prob, calculate_probability(sch));
+ hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
+
+ spin_unlock(root_lock);
+ rcu_read_unlock();
+ return HRTIMER_RESTART;
+}
+
+static struct netlink_range_validation dualpi2_alpha_beta_range = {
+ .min = 1,
+ .max = ALPHA_BETA_MAX,
+};
+
+static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
+ [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
+ [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
+ [TCA_DUALPI2_TARGET] = { .type = NLA_U32 },
+ [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
+ [TCA_DUALPI2_ALPHA] =
+ NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
+ [TCA_DUALPI2_BETA] =
+ NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
+ [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 },
+ [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 },
+ [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 },
+ [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
+ [TCA_DUALPI2_DROP_OVERLOAD] =
+ NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX),
+ [TCA_DUALPI2_DROP_EARLY] =
+ NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX),
+ [TCA_DUALPI2_C_PROTECTION] =
+ NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC),
+ [TCA_DUALPI2_ECN_MASK] =
+ NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT,
+ TCA_DUALPI2_ECN_MASK_MAX),
+ [TCA_DUALPI2_SPLIT_GSO] =
+ NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX),
+};
+
+static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_DUALPI2_MAX + 1];
+ struct dualpi2_sched_data *q;
+ int old_backlog;
+ int old_qlen;
+ int err;
+
+ if (!opt || !nla_len(opt)) {
+ NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required");
+ return -EINVAL;
+ }
+ err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
+ extack);
+ if (err < 0)
+ return err;
+ if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) {
+ NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes");
+ return -EINVAL;
+ }
+
+ q = qdisc_priv(sch);
+ sch_tree_lock(sch);
+
+ if (tb[TCA_DUALPI2_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
+
+ WRITE_ONCE(sch->limit, limit);
+ WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
+ }
+
+ if (tb[TCA_DUALPI2_MEMORY_LIMIT])
+ WRITE_ONCE(q->memory_limit,
+ nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
+
+ if (tb[TCA_DUALPI2_TARGET]) {
+ u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
+
+ WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
+ }
+
+ if (tb[TCA_DUALPI2_TUPDATE]) {
+ u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
+
+ WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate));
+ }
+
+ if (tb[TCA_DUALPI2_ALPHA]) {
+ u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
+
+ WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
+ }
+
+ if (tb[TCA_DUALPI2_BETA]) {
+ u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
+
+ WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
+ }
+
+ if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
+ u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
+
+ WRITE_ONCE(q->step_in_packets, true);
+ WRITE_ONCE(q->step_thresh, step_th);
+ } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
+ u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
+
+ WRITE_ONCE(q->step_in_packets, false);
+ WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th));
+ }
+
+ if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
+ WRITE_ONCE(q->min_qlen_step,
+ nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
+
+ if (tb[TCA_DUALPI2_COUPLING]) {
+ u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
+
+ WRITE_ONCE(q->coupling_factor, coupling);
+ }
+
+ if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
+ u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
+
+ WRITE_ONCE(q->drop_overload, (bool)drop_overload);
+ }
+
+ if (tb[TCA_DUALPI2_DROP_EARLY]) {
+ u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
+
+ WRITE_ONCE(q->drop_early, (bool)drop_early);
+ }
+
+ if (tb[TCA_DUALPI2_C_PROTECTION]) {
+ u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
+
+ dualpi2_calculate_c_protection(sch, q, wc);
+ }
+
+ if (tb[TCA_DUALPI2_ECN_MASK]) {
+ u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
+
+ WRITE_ONCE(q->ecn_mask, ecn_mask);
+ }
+
+ if (tb[TCA_DUALPI2_SPLIT_GSO]) {
+ u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
+
+ WRITE_ONCE(q->split_gso, (bool)split_gso);
+ }
+
+ old_qlen = qdisc_qlen(sch);
+ old_backlog = sch->qstats.backlog;
+ while (qdisc_qlen(sch) > sch->limit ||
+ q->memory_used > q->memory_limit) {
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
+
+ q->memory_used -= skb->truesize;
+ qdisc_qstats_backlog_dec(sch, skb);
+ rtnl_qdisc_drop(skb, sch);
+ }
+ qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
+ old_backlog - sch->qstats.backlog);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */
+static void dualpi2_reset_default(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ q->sch->limit = 10000; /* Max 125ms at 1Gbps */
+ q->memory_limit = get_memory_limit(sch, q->sch->limit);
+
+ q->pi2_target = 15 * NSEC_PER_MSEC;
+ q->pi2_tupdate = 16 * NSEC_PER_MSEC;
+ q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */
+ q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */
+
+ q->step_thresh = 1 * NSEC_PER_MSEC;
+ q->step_in_packets = false;
+
+ dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */
+
+ q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */
+ q->min_qlen_step = 0; /* Always apply step mark in L-queue */
+ q->coupling_factor = 2; /* window fairness for equal RTTs */
+ q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */
+ q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */
+ q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */
+}
+
+static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, 1), extack);
+ if (!q->l_queue)
+ return -ENOMEM;
+
+ err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack);
+ if (err)
+ return err;
+
+ q->sch = sch;
+ dualpi2_reset_default(sch);
+ hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+
+ if (opt && nla_len(opt)) {
+ err = dualpi2_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ return 0;
+}
+
+static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ bool step_in_pkts;
+ u32 step_th;
+
+ step_in_pkts = READ_ONCE(q->step_in_packets);
+ step_th = READ_ONCE(q->step_thresh);
+
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_put_failure;
+
+ if (step_in_pkts &&
+ (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
+ READ_ONCE(q->memory_limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_TARGET,
+ convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
+ nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
+ convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
+ nla_put_u32(skb, TCA_DUALPI2_ALPHA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
+ nla_put_u32(skb, TCA_DUALPI2_BETA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
+ nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) ||
+ nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
+ READ_ONCE(q->min_qlen_step)) ||
+ nla_put_u8(skb, TCA_DUALPI2_COUPLING,
+ READ_ONCE(q->coupling_factor)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
+ READ_ONCE(q->drop_overload)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
+ READ_ONCE(q->drop_early)) ||
+ nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
+ READ_ONCE(q->c_protection_wc)) ||
+ nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
+ nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
+ goto nla_put_failure;
+
+ if (!step_in_pkts &&
+ (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
+ READ_ONCE(q->memory_limit)) ||
+ nla_put_u32(skb, TCA_DUALPI2_TARGET,
+ convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
+ nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
+ convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
+ nla_put_u32(skb, TCA_DUALPI2_ALPHA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
+ nla_put_u32(skb, TCA_DUALPI2_BETA,
+ dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
+ nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US,
+ convert_ns_to_usec(step_th)) ||
+ nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
+ READ_ONCE(q->min_qlen_step)) ||
+ nla_put_u8(skb, TCA_DUALPI2_COUPLING,
+ READ_ONCE(q->coupling_factor)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
+ READ_ONCE(q->drop_overload)) ||
+ nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
+ READ_ONCE(q->drop_early)) ||
+ nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
+ READ_ONCE(q->c_protection_wc)) ||
+ nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
+ nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+}
+
+static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+ struct tc_dualpi2_xstats st = {
+ .prob = READ_ONCE(q->pi2_prob),
+ .packets_in_c = q->packets_in_c,
+ .packets_in_l = q->packets_in_l,
+ .maxq = q->maxq,
+ .ecn_mark = q->ecn_mark,
+ .credit = q->c_protection_credit,
+ .step_marks = q->step_marks,
+ .memory_used = q->memory_used,
+ .max_memory_used = q->max_memory_used,
+ .memory_limit = q->memory_limit,
+ };
+ u64 qc, ql;
+
+ get_queue_delays(q, &qc, &ql);
+ st.delay_l = convert_ns_to_usec(ql);
+ st.delay_c = convert_ns_to_usec(qc);
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+/* Reset both L-queue and C-queue, internal packet counters, PI probability,
+ * C-queue protection credit, and timestamps, while preserving current
+ * configuration of DUALPI2.
+ */
+static void dualpi2_reset(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+ qdisc_reset_queue(q->l_queue);
+ q->c_head_ts = 0;
+ q->l_head_ts = 0;
+ q->pi2_prob = 0;
+ q->packets_in_c = 0;
+ q->packets_in_l = 0;
+ q->maxq = 0;
+ q->ecn_mark = 0;
+ q->step_marks = 0;
+ q->memory_used = 0;
+ q->max_memory_used = 0;
+ dualpi2_reset_c_protection(q);
+}
+
+static void dualpi2_destroy(struct Qdisc *sch)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ q->pi2_tupdate = 0;
+ hrtimer_cancel(&q->pi2_timer);
+ if (q->l_queue)
+ qdisc_put(q->l_queue);
+ tcf_block_put(q->tcf_block);
+}
+
+static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static void dualpi2_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->tcf_block;
+}
+
+static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ /* We statically define only 2 queues */
+ for (i = 0; i < 2; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+/* Minimal class support to handle tc filters */
+static const struct Qdisc_class_ops dualpi2_class_ops = {
+ .leaf = dualpi2_leaf,
+ .find = dualpi2_find,
+ .tcf_block = dualpi2_tcf_block,
+ .bind_tcf = dualpi2_bind,
+ .unbind_tcf = dualpi2_unbind,
+ .walk = dualpi2_walk,
+};
+
+static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
+ .id = "dualpi2",
+ .cl_ops = &dualpi2_class_ops,
+ .priv_size = sizeof(struct dualpi2_sched_data),
+ .enqueue = dualpi2_qdisc_enqueue,
+ .dequeue = dualpi2_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = dualpi2_init,
+ .destroy = dualpi2_destroy,
+ .reset = dualpi2_reset,
+ .change = dualpi2_change,
+ .dump = dualpi2_dump,
+ .dump_stats = dualpi2_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init dualpi2_module_init(void)
+{
+ return register_qdisc(&dualpi2_qdisc_ops);
+}
+
+static void __exit dualpi2_module_exit(void)
+{
+ unregister_qdisc(&dualpi2_qdisc_ops);
+}
+
+module_init(dualpi2_module_init);
+module_exit(dualpi2_module_exit);
+
+MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler");
+MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>");
+MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>");
+MODULE_AUTHOR("Olga Albisser <olga@albisser.org>");
+MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>");
+MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("1.0");
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 1150f22983df..c74d778c32a1 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -22,10 +22,12 @@
#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK)
struct etf_sched_data {
bool offload;
bool deadline_mode;
+ bool skip_sock_check;
int clockid;
int queue;
s32 delta; /* in ns */
@@ -77,7 +79,10 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
struct sock *sk = nskb->sk;
ktime_t now;
- if (!sk)
+ if (q->skip_sock_check)
+ goto skip;
+
+ if (!sk || !sk_fullsock(sk))
return false;
if (!sock_flag(sk, SOCK_TXTIME))
@@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
if (sk->sk_txtime_deadline_mode != q->deadline_mode)
return false;
+skip:
now = q->get_time();
if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
return false;
@@ -131,8 +137,9 @@ static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
struct sock_exterr_skb *serr;
struct sk_buff *clone;
ktime_t txtime = skb->tstamp;
+ struct sock *sk = skb->sk;
- if (!skb->sk || !(skb->sk->sk_txtime_report_errors))
+ if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors))
return;
clone = skb_clone(skb, GFP_ATOMIC);
@@ -148,7 +155,7 @@ static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
serr->ee.ee_info = txtime; /* low part of tstamp */
- if (sock_queue_err_skb(skb->sk, clone))
+ if (sock_queue_err_skb(sk, clone))
kfree_skb(clone);
}
@@ -171,7 +178,7 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
parent = *p;
skb = rb_to_skb(parent);
- if (ktime_after(txtime, skb->tstamp)) {
+ if (ktime_compare(txtime, skb->tstamp) >= 0) {
p = &parent->rb_right;
leftmost = false;
} else {
@@ -316,9 +323,6 @@ static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q,
struct tc_etf_qopt_offload etf = { };
int err;
- if (q->offload)
- return 0;
-
if (!ops->ndo_setup_tc) {
NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload");
return -EOPNOTSUPP;
@@ -351,7 +355,8 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ETF_MAX, opt, etf_policy,
+ extack);
if (err < 0)
return err;
@@ -384,6 +389,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
q->clockid = qopt->clockid;
q->offload = OFFLOAD_IS_ON(qopt);
q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+ q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt);
switch (q->clockid) {
case CLOCK_REALTIME:
@@ -436,9 +442,6 @@ static void etf_reset(struct Qdisc *sch)
timesortedlist_clear(sch);
__qdisc_reset_queue(&sch->q);
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
-
q->last = 0;
}
@@ -460,18 +463,21 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_etf_qopt opt = { };
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
- opt.delta = q->delta;
- opt.clockid = q->clockid;
- if (q->offload)
+ opt.delta = READ_ONCE(q->delta);
+ opt.clockid = READ_ONCE(q->clockid);
+ if (READ_ONCE(q->offload))
opt.flags |= TC_ETF_OFFLOAD_ON;
- if (q->deadline_mode)
+ if (READ_ONCE(q->deadline_mode))
opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+ if (READ_ONCE(q->skip_sock_check))
+ opt.flags |= TC_ETF_SKIP_SOCK_CHECK;
+
if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -494,6 +500,7 @@ static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
.dump = etf_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("etf");
static int __init etf_module_init(void)
{
@@ -507,3 +514,4 @@ static void __exit etf_module_exit(void)
module_init(etf_module_init)
module_exit(etf_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Earliest TxTime First (ETF) qdisc");
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
new file mode 100644
index 000000000000..82635dd2cfa5
--- /dev/null
+++ b/net/sched/sch_ets.c
@@ -0,0 +1,839 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * net/sched/sch_ets.c Enhanced Transmission Selection scheduler
+ *
+ * Description
+ * -----------
+ *
+ * The Enhanced Transmission Selection scheduler is a classful queuing
+ * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler.
+ * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to
+ * implement the transmission selection described in 802.1Qaz.
+ *
+ * Although ETS is technically classful, it's not possible to add and remove
+ * classes at will. Instead one specifies the number of classes, how many are
+ * PRIO-like and how many DRR-like, and quanta for the latter.
+ *
+ * Algorithm
+ * ---------
+ *
+ * The strict classes, if any, are tried for traffic first: first band 0, if it
+ * has no traffic then band 1, etc.
+ *
+ * When there is no traffic in any of the strict queues, the bandwidth-sharing
+ * ones are tried next. Each band is assigned a deficit counter, initialized to
+ * "quantum" of that band. ETS maintains a list of active bandwidth-sharing
+ * bands whose qdiscs are non-empty. A packet is dequeued from the band at the
+ * head of the list if the packet size is smaller than or equal to the deficit
+ * counter. If the counter is too small, it is increased by "quantum" and the
+ * scheduler moves on to the next band in the active list.
+ */
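A minimal standalone sketch (not part of the patch) of the deficit round-robin step described above, using made-up quanta and packet sizes; it models only the bandwidth-sharing rotation and omits the strict bands and active-list bookkeeping of the real qdisc:

/* Userspace model of the DRR rotation; all values are illustrative. */
#include <stdio.h>

#define NBANDS 3

int main(void)
{
	unsigned int quantum[NBANDS] = { 1500, 3000, 4500 }; /* example quanta */
	unsigned int deficit[NBANDS] = { 1500, 3000, 4500 }; /* starts at quantum */
	unsigned int pkt_len[NBANDS] = { 2000, 2500, 1000 }; /* head-of-line sizes */
	unsigned int band = 0, sent = 0;

	while (sent < NBANDS) {
		if (pkt_len[band] && pkt_len[band] <= deficit[band]) {
			/* Enough deficit: "transmit" the head packet. */
			deficit[band] -= pkt_len[band];
			printf("band %u sends %u bytes, deficit now %u\n",
			       band, pkt_len[band], deficit[band]);
			pkt_len[band] = 0; /* band is now empty in this toy model */
			sent++;
		} else if (pkt_len[band]) {
			/* Counter too small: top it up and move on. */
			deficit[band] += quantum[band];
		}
		band = (band + 1) % NBANDS;
	}
	return 0;
}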
+
+#include <linux/module.h>
+#include <net/gen_stats.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct ets_class {
+ struct list_head alist; /* In struct ets_sched.active. */
+ struct Qdisc *qdisc;
+ u32 quantum;
+ u32 deficit;
+ struct gnet_stats_basic_sync bstats;
+ struct gnet_stats_queue qstats;
+};
+
+struct ets_sched {
+ struct list_head active;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ unsigned int nbands;
+ unsigned int nstrict;
+ u8 prio2band[TC_PRIO_MAX + 1];
+ struct ets_class classes[TCQ_ETS_MAX_BANDS];
+};
+
+static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_NBANDS] = { .type = NLA_U8 },
+ [TCA_ETS_NSTRICT] = { .type = NLA_U8 },
+ [TCA_ETS_QUANTA] = { .type = NLA_NESTED },
+ [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static bool cl_is_active(struct ets_class *cl)
+{
+ return !list_empty(&cl->alist);
+}
+
+static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
+ unsigned int *quantum,
+ struct netlink_ext_ack *extack)
+{
+ *quantum = nla_get_u32(attr);
+ if (!*quantum) {
+ NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct ets_class *
+ets_class_from_arg(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (arg == 0 || arg > q->nbands)
+ return NULL;
+ return &q->classes[arg - 1];
+}
+
+static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band = cl - q->classes;
+
+ return TC_H_MAKE(sch->handle, band + 1);
+}
+
+static void ets_offload_change(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct tc_ets_qopt_offload qopt;
+ unsigned int w_psum_prev = 0;
+ unsigned int q_psum = 0;
+ unsigned int q_sum = 0;
+ unsigned int quantum;
+ unsigned int w_psum;
+ unsigned int weight;
+ unsigned int i;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.bands = q->nbands;
+ qopt.replace_params.qstats = &sch->qstats;
+ memcpy(&qopt.replace_params.priomap,
+ q->prio2band, sizeof(q->prio2band));
+
+ for (i = 0; i < q->nbands; i++)
+ q_sum += q->classes[i].quantum;
+
+ for (i = 0; i < q->nbands; i++) {
+ quantum = q->classes[i].quantum;
+ q_psum += quantum;
+ w_psum = quantum ? q_psum * 100 / q_sum : 0;
+ weight = w_psum - w_psum_prev;
+ w_psum_prev = w_psum;
+
+ qopt.replace_params.quanta[i] = quantum;
+ qopt.replace_params.weights[i] = weight;
+ }
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
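ets_offload_change() above turns the configured quanta into per-band percentage weights via prefix sums, so integer rounding cannot push the total above 100. A hedged worked example (values are made up, not from the patch): quanta {1000, 2000, 5000} yield weights 12, 25 and 63.

/* Standalone illustration of the quanta-to-weight conversion. */
#include <stdio.h>

int main(void)
{
	unsigned int quanta[] = { 1000, 2000, 5000 };	/* example quanta */
	unsigned int n = 3, q_sum = 0, q_psum = 0, w_psum_prev = 0;
	unsigned int i;

	for (i = 0; i < n; i++)
		q_sum += quanta[i];

	for (i = 0; i < n; i++) {
		unsigned int w_psum, weight;

		q_psum += quanta[i];
		w_psum = quanta[i] ? q_psum * 100 / q_sum : 0;
		weight = w_psum - w_psum_prev;
		w_psum_prev = w_psum;
		printf("band %u: quantum %u -> weight %u%%\n",
		       i, quanta[i], weight);
	}
	return 0;
}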
+
+static void ets_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new,
+ struct Qdisc *old, unsigned long arg,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_GRAFT;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.graft_params.band = arg - 1;
+ qopt.graft_params.child_handle = new->handle;
+
+ qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS,
+ &qopt, extack);
+}
+
+static int ets_offload_dump(struct Qdisc *sch)
+{
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl)
+{
+ unsigned int band = cl - q->classes;
+
+ return band < q->nstrict;
+}
+
+static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, *arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int quantum;
+ int err;
+
+ /* Classes can be added and removed only through Qdisc_ops.change
+ * interface.
+ */
+ if (!cl) {
+ NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_QUANTA_BAND])
+ /* Nothing to configure. */
+ return 0;
+
+ if (ets_class_is_strict(q, cl)) {
+ NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum");
+ return -EINVAL;
+ }
+
+ err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum,
+ extack);
+ if (err)
+ return err;
+
+ sch_tree_lock(sch);
+ cl->quantum = quantum;
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ return 0;
+}
+
+static int ets_class_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, cl), NULL);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ ets_offload_graft(sch, new, *old, arg, extack);
+ return 0;
+}
+
+static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ return cl->qdisc;
+}
+
+static unsigned long ets_class_find(struct Qdisc *sch, u32 classid)
+{
+ unsigned long band = TC_H_MIN(classid);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (band - 1 >= q->nbands)
+ return 0;
+ return band;
+}
+
+static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ /* We get notified about zero-length child Qdiscs as well if they are
+ * offloaded. Those aren't on the active list though, so don't attempt
+ * to remove them.
+ */
+ if (!ets_class_is_strict(q, cl) && sch->q.qlen)
+ list_del_init(&cl->alist);
+}
+
+static int ets_class_dump(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = ets_class_id(sch, cl);
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+ if (!ets_class_is_strict(q, cl)) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum))
+ goto nla_put_failure;
+ }
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct Qdisc *cl_q = cl->qdisc;
+
+ if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->nbands; i++) {
+ if (!tc_qdisc_stats_dump(sch, i + 1, arg))
+ break;
+ }
+}
+
+static struct tcf_block *
+ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (cl) {
+ NL_SET_ERR_MSG(extack, "ETS classid must be zero");
+ return NULL;
+ }
+
+ return q->block;
+}
+
+static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return ets_class_find(sch, classid);
+}
+
+static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ u32 band = skb->priority;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
+ fl = rcu_dereference_bh(q->filter_list);
+ err = tcf_classify(skb, NULL, fl, &res, false);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ fallthrough;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ if (!fl || err < 0) {
+ if (TC_H_MAJ(band))
+ band = 0;
+ return &q->classes[q->prio2band[band & TC_PRIO_MAX]];
+ }
+ band = res.classid;
+ }
+ band = TC_H_MIN(band) - 1;
+ if (band >= q->nbands)
+ return &q->classes[q->prio2band[0]];
+ return &q->classes[band];
+}
+
+static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ unsigned int len = qdisc_pkt_len(skb);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ int err = 0;
+
+ cl = ets_classify(skb, sch, &err);
+ if (!cl) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (!cl_is_active(cl) && !ets_class_is_strict(q, cl)) {
+ list_add_tail(&cl->alist, &q->active);
+ cl->deficit = cl->quantum;
+ }
+
+ sch->qstats.backlog += len;
+ sch->q.qlen++;
+ return err;
+}
+
+static struct sk_buff *
+ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb)
+{
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ return skb;
+}
+
+static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ struct sk_buff *skb;
+ unsigned int band;
+ unsigned int len;
+
+ while (1) {
+ for (band = 0; band < q->nstrict; band++) {
+ cl = &q->classes[band];
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (skb)
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ if (list_empty(&q->active))
+ goto out;
+
+ cl = list_first_entry(&q->active, struct ets_class, alist);
+ skb = cl->qdisc->ops->peek(cl->qdisc);
+ if (!skb) {
+ qdisc_warn_nonwc(__func__, cl->qdisc);
+ goto out;
+ }
+
+ len = qdisc_pkt_len(skb);
+ if (len <= cl->deficit) {
+ cl->deficit -= len;
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (unlikely(!skb))
+ goto out;
+ if (cl->qdisc->q.qlen == 0)
+ list_del_init(&cl->alist);
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ cl->deficit += cl->quantum;
+ list_move_tail(&cl->alist, &q->active);
+ }
+out:
+ return NULL;
+}
+
+static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr,
+ unsigned int nbands, u8 *priomap,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int prio = 0;
+ u8 band;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX,
+ ets_priomap_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, priomap_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_PRIOMAP_BAND:
+ if (prio > TC_PRIO_MAX) {
+ NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap");
+ return -EINVAL;
+ }
+ band = nla_get_u8(attr);
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap");
+ return -EINVAL;
+ }
+ priomap[prio++] = band;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr,
+ unsigned int nbands, unsigned int nstrict,
+ unsigned int *quanta,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int band = nstrict;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX,
+ ets_quanta_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err < 0)
+ return err;
+
+ nla_for_each_nested(attr, quanta_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_QUANTA_BAND:
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands");
+ return -EINVAL;
+ }
+ err = ets_quantum_parse(sch, attr, &quanta[band++],
+ extack);
+ if (err)
+ return err;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0};
+ struct Qdisc *queues[TCQ_ETS_MAX_BANDS];
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int oldbands = q->nbands;
+ u8 priomap[TC_PRIO_MAX + 1];
+ unsigned int nstrict = 0;
+ unsigned int nbands;
+ unsigned int i;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_NBANDS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument");
+ return -EINVAL;
+ }
+ nbands = nla_get_u8(tb[TCA_ETS_NBANDS]);
+ if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands");
+ return -EINVAL;
+ }
+ /* Unless overridden, traffic goes to the last band. */
+ memset(priomap, nbands - 1, sizeof(priomap));
+
+ if (tb[TCA_ETS_NSTRICT]) {
+ nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]);
+ if (nstrict > nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands");
+ return -EINVAL;
+ }
+ }
+
+ if (tb[TCA_ETS_PRIOMAP]) {
+ err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP],
+ nbands, priomap, extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[TCA_ETS_QUANTA]) {
+ err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA],
+ nbands, nstrict, quanta, extack);
+ if (err)
+ return err;
+ }
+ /* If there are more bands than strict + quanta provided, the remaining
+ * ones are ETS with a quantum of one MTU. Initialize the missing values here.
+ */
+ for (i = nstrict; i < nbands; i++) {
+ if (!quanta[i])
+ quanta[i] = psched_mtu(qdisc_dev(sch));
+ }
+
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < nbands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, &q->classes[i]),
+ extack);
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_put(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
+ sch_tree_lock(sch);
+
+ for (i = nbands; i < oldbands; i++) {
+ if (i >= q->nstrict && q->classes[i].qdisc->q.qlen)
+ list_del_init(&q->classes[i].alist);
+ qdisc_purge_queue(q->classes[i].qdisc);
+ }
+
+ WRITE_ONCE(q->nbands, nbands);
+ for (i = nstrict; i < q->nstrict; i++) {
+ if (q->classes[i].qdisc->q.qlen) {
+ list_add_tail(&q->classes[i].alist, &q->active);
+ q->classes[i].deficit = quanta[i];
+ }
+ }
+ WRITE_ONCE(q->nstrict, nstrict);
+ memcpy(q->prio2band, priomap, sizeof(priomap));
+
+ for (i = 0; i < q->nbands; i++)
+ WRITE_ONCE(q->classes[i].quantum, quanta[i]);
+
+ for (i = oldbands; i < q->nbands; i++) {
+ q->classes[i].qdisc = queues[i];
+ if (q->classes[i].qdisc != &noop_qdisc)
+ qdisc_hash_add(q->classes[i].qdisc, true);
+ }
+
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ for (i = q->nbands; i < oldbands; i++) {
+ qdisc_put(q->classes[i].qdisc);
+ q->classes[i].qdisc = NULL;
+ WRITE_ONCE(q->classes[i].quantum, 0);
+ q->classes[i].deficit = 0;
+ gnet_stats_basic_sync_init(&q->classes[i].bstats);
+ memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats));
+ }
+ return 0;
+}
+
+static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int err, i;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&q->active);
+ for (i = 0; i < TCQ_ETS_MAX_BANDS; i++)
+ INIT_LIST_HEAD(&q->classes[i].alist);
+
+ return ets_qdisc_change(sch, opt, extack);
+}
+
+static void ets_qdisc_reset(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ for (band = q->nstrict; band < q->nbands; band++) {
+ if (q->classes[band].qdisc->q.qlen)
+ list_del_init(&q->classes[band].alist);
+ }
+ for (band = 0; band < q->nbands; band++)
+ qdisc_reset(q->classes[band].qdisc);
+}
+
+static void ets_qdisc_destroy(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ ets_offload_destroy(sch);
+ tcf_block_put(q->block);
+ for (band = 0; band < q->nbands; band++)
+ qdisc_put(q->classes[band].qdisc);
+}
+
+static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct nlattr *nest;
+ u8 nbands, nstrict;
+ int band;
+ int prio;
+ int err;
+
+ err = ets_offload_dump(sch);
+ if (err)
+ return err;
+
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_err;
+
+ nbands = READ_ONCE(q->nbands);
+ if (nla_put_u8(skb, TCA_ETS_NBANDS, nbands))
+ goto nla_err;
+
+ nstrict = READ_ONCE(q->nstrict);
+ if (nstrict && nla_put_u8(skb, TCA_ETS_NSTRICT, nstrict))
+ goto nla_err;
+
+ if (nbands > nstrict) {
+ nest = nla_nest_start(skb, TCA_ETS_QUANTA);
+ if (!nest)
+ goto nla_err;
+
+ for (band = nstrict; band < nbands; band++) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND,
+ READ_ONCE(q->classes[band].quantum)))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+
+ nest = nla_nest_start(skb, TCA_ETS_PRIOMAP);
+ if (!nest)
+ goto nla_err;
+
+ for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
+ if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND,
+ READ_ONCE(q->prio2band[prio])))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+
+ return nla_nest_end(skb, opts);
+
+nla_err:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static const struct Qdisc_class_ops ets_class_ops = {
+ .change = ets_class_change,
+ .graft = ets_class_graft,
+ .leaf = ets_class_leaf,
+ .find = ets_class_find,
+ .qlen_notify = ets_class_qlen_notify,
+ .dump = ets_class_dump,
+ .dump_stats = ets_class_dump_stats,
+ .walk = ets_qdisc_walk,
+ .tcf_block = ets_qdisc_tcf_block,
+ .bind_tcf = ets_qdisc_bind_tcf,
+ .unbind_tcf = ets_qdisc_unbind_tcf,
+};
+
+static struct Qdisc_ops ets_qdisc_ops __read_mostly = {
+ .cl_ops = &ets_class_ops,
+ .id = "ets",
+ .priv_size = sizeof(struct ets_sched),
+ .enqueue = ets_qdisc_enqueue,
+ .dequeue = ets_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .change = ets_qdisc_change,
+ .init = ets_qdisc_init,
+ .reset = ets_qdisc_reset,
+ .destroy = ets_qdisc_destroy,
+ .dump = ets_qdisc_dump,
+ .owner = THIS_MODULE,
+};
+MODULE_ALIAS_NET_SCH("ets");
+
+static int __init ets_init(void)
+{
+ return register_qdisc(&ets_qdisc_ops);
+}
+
+static void __exit ets_exit(void)
+{
+ unregister_qdisc(&ets_qdisc_ops);
+}
+
+module_init(ets_init);
+module_exit(ets_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Enhanced Transmission Selection(ETS) scheduler");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 3809c9bf8896..e6bfd39ff339 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_fifo.c The simplest FIFO queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -16,13 +12,15 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
/* 1 band FIFO pseudo-"scheduler" */
static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
+ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <=
+ READ_ONCE(sch->limit)))
return qdisc_enqueue_tail(skb, sch);
return qdisc_drop(skb, sch, to_free);
@@ -31,7 +29,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- if (likely(sch->q.qlen < sch->limit))
+ if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
return qdisc_enqueue_tail(skb, sch);
return qdisc_drop(skb, sch, to_free);
@@ -42,7 +40,10 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
unsigned int prev_backlog;
- if (likely(sch->q.qlen < sch->limit))
+ if (unlikely(READ_ONCE(sch->limit) == 0))
+ return qdisc_drop(skb, sch, to_free);
+
+ if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
return qdisc_enqueue_tail(skb, sch);
prev_backlog = sch->qstats.backlog;
@@ -55,8 +56,49 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return NET_XMIT_CN;
}
-static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static void fifo_offload_init(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_fifo_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_FIFO_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt);
+}
+
+static void fifo_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_fifo_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_FIFO_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt);
+}
+
+static int fifo_offload_dump(struct Qdisc *sch)
+{
+ struct tc_fifo_qopt_offload qopt;
+
+ qopt.command = TC_FIFO_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_FIFO, &qopt);
+}
+
+static int __fifo_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
{
bool bypass;
bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
@@ -67,14 +109,14 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
if (is_bfifo)
limit *= psched_mtu(qdisc_dev(sch));
- sch->limit = limit;
+ WRITE_ONCE(sch->limit, limit);
} else {
struct tc_fifo_qopt *ctl = nla_data(opt);
if (nla_len(opt) < sizeof(*ctl))
return -EINVAL;
- sch->limit = ctl->limit;
+ WRITE_ONCE(sch->limit, ctl->limit);
}
if (is_bfifo)
@@ -86,12 +128,37 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
+
return 0;
}
-static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
{
- struct tc_fifo_qopt opt = { .limit = sch->limit };
+ int err;
+
+ err = __fifo_init(sch, opt, extack);
+ if (err)
+ return err;
+
+ fifo_offload_init(sch);
+ return 0;
+}
+
+static int fifo_hd_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ return __fifo_init(sch, opt, extack);
+}
+
+static void fifo_destroy(struct Qdisc *sch)
+{
+ fifo_offload_destroy(sch);
+}
+
+static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tc_fifo_qopt opt = { .limit = READ_ONCE(sch->limit) };
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -101,6 +168,22 @@ nla_put_failure:
return -1;
}
+static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ int err;
+
+ err = fifo_offload_dump(sch);
+ if (err)
+ return err;
+
+ return __fifo_dump(sch, skb);
+}
+
+static int fifo_hd_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ return __fifo_dump(sch, skb);
+}
+
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.id = "pfifo",
.priv_size = 0,
@@ -108,6 +191,7 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.init = fifo_init,
+ .destroy = fifo_destroy,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
@@ -122,6 +206,7 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.init = fifo_init,
+ .destroy = fifo_destroy,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
@@ -135,10 +220,10 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
.enqueue = pfifo_tail_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
- .init = fifo_init,
+ .init = fifo_hd_init,
.reset = qdisc_reset_queue,
- .change = fifo_init,
- .dump = fifo_dump,
+ .change = fifo_hd_init,
+ .dump = fifo_hd_dump,
.owner = THIS_MODULE,
};
@@ -152,6 +237,9 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit)
if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
return 0;
+ if (!q->ops->change)
+ return 0;
+
nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (nla) {
nla->nla_type = RTM_NEWQDISC;
@@ -185,3 +273,4 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
return q ? : ERR_PTR(err);
}
EXPORT_SYMBOL(fifo_create_dflt);
+MODULE_DESCRIPTION("Single queue packet and byte based First In First Out(P/BFIFO) scheduler");
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 1a662f2bb7bb..6e5f2f4f2415 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
*
- * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
*
* Meant to be mostly used for locally generated traffic :
* Fast classification depends on skb->sk being set before reaching us.
@@ -54,21 +50,45 @@
#include <net/tcp_states.h>
#include <net/tcp.h>
+struct fq_skb_cb {
+ u64 time_to_send;
+ u8 band;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+ return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
/*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotonically increasing time_to_send, they are placed in O(1)
+ * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
*/
struct fq_flow {
+/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
+ struct rb_root t_root;
struct sk_buff *head; /* list of skbs for this flow : first skb */
union {
struct sk_buff *tail; /* last skb in the list */
- unsigned long age; /* jiffies when flow was emptied, for gc */
+ unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */
+ };
+ union {
+ struct rb_node fq_node; /* anchor in fq_root[] trees */
+ /* Following field is only used for q->internal,
+ * because q->internal is not hashed in fq_root[]
+ */
+ u64 stat_fastpath_packets;
};
- struct rb_node fq_node; /* anchor in fq_root[] trees */
struct sock *sk;
+ u32 socket_hash; /* sk_hash */
int qlen; /* number of packets in flow queue */
+
+/* Second cache line */
int credit;
- u32 socket_hash; /* sk_hash */
- struct fq_flow *next; /* next pointer in RR lists, or &detached */
+ int band;
+ struct fq_flow *next; /* next pointer in RR lists */
struct rb_node rate_node; /* anchor in q->delayed tree */
u64 time_next_packet;
@@ -79,63 +99,109 @@ struct fq_flow_head {
struct fq_flow *last;
};
-struct fq_sched_data {
+struct fq_perband_flows {
struct fq_flow_head new_flows;
-
struct fq_flow_head old_flows;
+ int credit;
+ int quantum; /* based on band nr : 576KB, 192KB, 64KB */
+};
- struct rb_root delayed; /* for rate limited flows */
- u64 time_next_delayed_flow;
- unsigned long unthrottle_latency_ns;
+#define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2)
+
+struct fq_sched_data {
+/* Read mostly cache line */
- struct fq_flow internal; /* for non classified or high prio packets */
+ u64 offload_horizon;
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
u32 flow_plimit; /* max packets per flow */
unsigned long flow_max_rate; /* optional max rate per flow */
u64 ce_threshold;
+ u64 horizon; /* horizon in ns */
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
+ u8 horizon_drop;
+ u8 prio2band[FQ_PRIO2BAND_CRUMB_SIZE];
+ u32 timer_slack; /* hrtimer slack in ns */
+
+/* Read/Write fields. */
+
+ unsigned int band_nr; /* band being serviced in fq_dequeue() */
+
+ struct fq_perband_flows band_flows[FQ_BANDS];
+
+ struct fq_flow internal; /* fastpath queue. */
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
+ u32 band_pkt_count[FQ_BANDS];
u32 flows;
- u32 inactive_flows;
+ u32 inactive_flows; /* Flows with no packet to send. */
u32 throttled_flows;
- u64 stat_gc_flows;
- u64 stat_internal_packets;
u64 stat_throttled;
+ struct qdisc_watchdog watchdog;
+ u64 stat_gc_flows;
+
+/* Seldom used fields. */
+
+ u64 stat_band_drops[FQ_BANDS];
u64 stat_ce_mark;
+ u64 stat_horizon_drops;
+ u64 stat_horizon_caps;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
- struct qdisc_watchdog watchdog;
};
-/* special value to mark a detached flow (not on old/new list) */
-static struct fq_flow detached, throttled;
+/* return the i-th 2-bit value ("crumb") */
+static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
+{
+ return (READ_ONCE(prio2band[prio / 4]) >> (2 * (prio & 0x3))) & 0x3;
+}
+/*
+ * f->tail and f->age share the same location.
+ * The low order bit lets us tell whether this location points to a sk_buff
+ * or holds a jiffies value, provided the jiffies value is forced to be odd.
+ * This relies on f->tail's low order bit being 0, since alignof(struct sk_buff) >= 2.
+ */
static void fq_flow_set_detached(struct fq_flow *f)
{
- f->next = &detached;
- f->age = jiffies;
+ f->age = jiffies | 1UL;
}
static bool fq_flow_is_detached(const struct fq_flow *f)
{
- return f->next == &detached;
+ return !!(f->age & 1UL);
}
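A small userspace sketch (not kernel code) of the tagged-union idea above: the same word holds either an aligned pointer (low bit 0) or an odd timestamp value (low bit 1). It assumes pointers to the payload struct are at least 2-byte aligned, as alignof(struct sk_buff) guarantees in the kernel; names are illustrative.

#include <assert.h>
#include <stdio.h>

struct pkt { int len; };

union slot {
	struct pkt *tail;	/* valid when the low bit is 0 */
	unsigned long age;	/* valid when the low bit is 1 */
};

static int slot_holds_age(union slot s)
{
	return s.age & 1UL;	/* the low bit is the tag */
}

int main(void)
{
	struct pkt p = { .len = 100 };
	union slot s;

	s.tail = &p;
	assert(!slot_holds_age(s));	/* aligned pointer => even word */

	s.age = 12345UL | 1UL;		/* force the tag bit when storing a time */
	assert(slot_holds_age(s));
	printf("slot now holds an age value: %lu\n", s.age);
	return 0;
}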
+/* special value to mark a throttled flow (not on old/new list) */
+static struct fq_flow throttled;
+
static bool fq_flow_is_throttled(const struct fq_flow *f)
{
return f->next == &throttled;
}
-static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+enum new_flow {
+ NEW_FLOW,
+ OLD_FLOW
+};
+
+static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
+ enum new_flow list_sel)
{
+ struct fq_perband_flows *pband = &q->band_flows[flow->band];
+ struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
+ &pband->new_flows :
+ &pband->old_flows;
+
if (head->first)
head->last->next = flow;
else
@@ -148,7 +214,7 @@ static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
rb_erase(&f->rate_node, &q->delayed);
q->throttled_flows--;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_add_tail(q, f, OLD_FLOW);
}
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
@@ -193,9 +259,10 @@ static void fq_gc(struct fq_sched_data *q,
struct rb_root *root,
struct sock *sk)
{
- struct fq_flow *f, *tofree[FQ_GC_MAX];
struct rb_node **p, *parent;
- int fcnt = 0;
+ void *tofree[FQ_GC_MAX];
+ struct fq_flow *f;
+ int i, fcnt = 0;
p = &root->rb_node;
parent = NULL;
@@ -218,28 +285,81 @@ static void fq_gc(struct fq_sched_data *q,
p = &parent->rb_left;
}
+ if (!fcnt)
+ return;
+
+ for (i = fcnt; i > 0; ) {
+ f = tofree[--i];
+ rb_erase(&f->fq_node, root);
+ }
q->flows -= fcnt;
q->inactive_flows -= fcnt;
q->stat_gc_flows += fcnt;
- while (fcnt) {
- struct fq_flow *f = tofree[--fcnt];
- rb_erase(&f->fq_node, root);
- kmem_cache_free(fq_flow_cachep, f);
+ kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
+}
+
+/* Fast path can be used if:
+ * 1) Packet tstamp is in the past, or within the pacing offload horizon.
+ * 2) FQ qlen == 0 OR
+ * (no flow is currently eligible for transmit,
+ * AND fast path queue has less than 8 packets)
+ * 3) No SO_MAX_PACING_RATE on the socket (if any).
+ * 4) No @maxrate attribute on this qdisc.
+ *
+ * FQ cannot use the generic TCQ_F_CAN_BYPASS infrastructure.
+ */
+static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
+ u64 now)
+{
+ const struct fq_sched_data *q = qdisc_priv(sch);
+ const struct sock *sk;
+
+ if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon)
+ return false;
+
+ if (sch->q.qlen != 0) {
+ /* Even if some packets are stored in this qdisc,
+ * we can still enable fast path if all of them are
+ * scheduled in the future (ie no flows are eligible)
+ * or in the fast path queue.
+ */
+ if (q->flows != q->inactive_flows + q->throttled_flows)
+ return false;
+
+ /* Do not allow the fast path queue to explode; we want Fair Queue mode
+ * under pressure.
+ */
+ if (q->internal.qlen >= 8)
+ return false;
+
+ /* Ordering invariants fall apart if some delayed flows
+ * are ready but we haven't serviced them yet.
+ */
+ if (q->time_next_delayed_flow <= now + q->offload_horizon)
+ return false;
}
+
+ sk = skb->sk;
+ if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
+ sk->sk_max_pacing_rate != ~0UL)
+ return false;
+
+ if (q->flow_max_rate != ~0UL)
+ return false;
+
+ return true;
}
-static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
+ u64 now)
{
+ struct fq_sched_data *q = qdisc_priv(sch);
struct rb_node **p, *parent;
struct sock *sk = skb->sk;
struct rb_root *root;
struct fq_flow *f;
- /* warning: no starvation prevention... */
- if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
- return &q->internal;
-
/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
* or a listener (SYNCOOKIE mode)
* 1) request sockets are not full blown,
@@ -248,8 +368,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
* 3) We do not want to rate limit them (eg SYNFLOOD attack),
* especially if the listener set SO_MAX_PACING_RATE
* 4) We pretend they are orphaned
+ * TCP can also associate TIME_WAIT sockets with RST or ACK packets.
*/
- if (!sk || sk_listener(sk)) {
+ if (!sk || sk_listener_or_tw(sk)) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not
@@ -257,13 +378,31 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
*/
sk = (struct sock *)((hash << 1) | 1UL);
skb_orphan(skb);
+ } else if (sk->sk_state == TCP_CLOSE) {
+ unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+ /*
+ * Sockets in TCP_CLOSE are not connected.
+ * The typical use case is UDP sockets: they can send packets
+ * with sendto() to many different destinations.
+ * We probably could use a generic bit advertising
+ * non-connected sockets, instead of sk_state == TCP_CLOSE,
+ * if we care enough.
+ */
+ sk = (struct sock *)((hash << 1) | 1UL);
+ }
+
+ if (fq_fastpath_check(sch, skb, now)) {
+ q->internal.stat_fastpath_packets++;
+ if (skb->sk == sk && q->rate_enable &&
+ READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ)
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
+ return &q->internal;
}
root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
- if (q->flows >= (2U << q->fq_trees_log) &&
- q->inactive_flows > q->flows/2)
- fq_gc(q, root, sk);
+ fq_gc(q, root, sk);
p = &root->rb_node;
parent = NULL;
@@ -277,10 +416,13 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
* If not, we need to refill credit with
* initial quantum
*/
- if (unlikely(skb->sk &&
+ if (unlikely(skb->sk == sk &&
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
+ if (q->rate_enable)
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
if (fq_flow_is_throttled(f))
fq_flow_unset_throttled(q, f);
f->time_next_packet = 0ULL;
@@ -298,10 +440,16 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
q->stat_allocation_errors++;
return &q->internal;
}
+ /* f->t_root is already zeroed after kmem_cache_zalloc() */
+
fq_flow_set_detached(f);
f->sk = sk;
- if (skb->sk)
+ if (skb->sk == sk) {
f->socket_hash = sk->sk_hash;
+ if (q->rate_enable)
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
+ }
f->credit = q->initial_quantum;
rb_link_node(&f->fq_node, parent, p);
@@ -312,92 +460,171 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
return f;
}
-
-/* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+static struct sk_buff *fq_peek(struct fq_flow *flow)
{
- struct sk_buff *skb = flow->head;
+ struct sk_buff *skb = skb_rb_first(&flow->t_root);
+ struct sk_buff *head = flow->head;
- if (skb) {
- flow->head = skb->next;
- skb_mark_not_on_list(skb);
- flow->qlen--;
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
+ if (!skb)
+ return head;
+
+ if (!head)
+ return skb;
+
+ if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+ return skb;
+ return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+ struct sk_buff *skb)
+{
+ if (skb == flow->head) {
+ struct sk_buff *next = skb->next;
+
+ prefetch(next);
+ flow->head = next;
+ } else {
+ rb_erase(&skb->rbnode, &flow->t_root);
+ skb->dev = qdisc_dev(sch);
}
- return skb;
+}
+
+/* Remove one skb from flow queue.
+ * This skb must be the return value of prior fq_peek().
+ */
+static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
+ struct sk_buff *skb)
+{
+ fq_erase_head(sch, flow, skb);
+ skb_mark_not_on_list(skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
}
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
- struct sk_buff *head = flow->head;
+ struct rb_node **p, *parent;
+ struct sk_buff *head, *aux;
- skb->next = NULL;
- if (!head)
- flow->head = skb;
- else
- flow->tail->next = skb;
+ head = flow->head;
+ if (!head ||
+ fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
+ if (!head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+ return;
+ }
+
+ p = &flow->t_root.rb_node;
+ parent = NULL;
+
+ while (*p) {
+ parent = *p;
+ aux = rb_to_skb(parent);
+ if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &flow->t_root);
+}
- flow->tail = skb;
+static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
+ const struct fq_sched_data *q, u64 now)
+{
+ return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
}
+#define FQDR(reason) SKB_DROP_REASON_FQ_##reason
+
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
+ u64 now;
+ u8 band;
- if (unlikely(sch->q.qlen >= sch->limit))
- return qdisc_drop(skb, sch, to_free);
+ band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
+ if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
+ q->stat_band_drops[band]++;
+ return qdisc_drop_reason(skb, sch, to_free,
+ FQDR(BAND_LIMIT));
+ }
- f = fq_classify(skb, q);
- if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
- q->stat_flows_plimit++;
- return qdisc_drop(skb, sch, to_free);
+ now = ktime_get_ns();
+ if (!skb->tstamp) {
+ fq_skb_cb(skb)->time_to_send = now;
+ } else {
+ /* Check if packet timestamp is too far in the future. */
+ if (fq_packet_beyond_horizon(skb, q, now)) {
+ if (q->horizon_drop) {
+ q->stat_horizon_drops++;
+ return qdisc_drop_reason(skb, sch, to_free,
+ FQDR(HORIZON_LIMIT));
+ }
+ q->stat_horizon_caps++;
+ skb->tstamp = now + q->horizon;
+ }
+ fq_skb_cb(skb)->time_to_send = skb->tstamp;
}
- f->qlen++;
- qdisc_qstats_backlog_inc(sch, skb);
- if (fq_flow_is_detached(f)) {
- struct sock *sk = skb->sk;
-
- fq_flow_add_tail(&q->new_flows, f);
- if (time_after(jiffies, f->age + q->flow_refill_delay))
- f->credit = max_t(u32, f->credit, q->quantum);
- if (sk && q->rate_enable) {
- if (unlikely(smp_load_acquire(&sk->sk_pacing_status) !=
- SK_PACING_FQ))
- smp_store_release(&sk->sk_pacing_status,
- SK_PACING_FQ);
+ f = fq_classify(sch, skb, now);
+
+ if (f != &q->internal) {
+ if (unlikely(f->qlen >= q->flow_plimit)) {
+ q->stat_flows_plimit++;
+ return qdisc_drop_reason(skb, sch, to_free,
+ FQDR(FLOW_LIMIT));
+ }
+
+ if (fq_flow_is_detached(f)) {
+ fq_flow_add_tail(q, f, NEW_FLOW);
+ if (time_after(jiffies, f->age + q->flow_refill_delay))
+ f->credit = max_t(u32, f->credit, q->quantum);
}
- q->inactive_flows--;
+
+ f->band = band;
+ q->band_pkt_count[band]++;
+ fq_skb_cb(skb)->band = band;
+ if (f->qlen == 0)
+ q->inactive_flows--;
}
+ f->qlen++;
/* Note: this overwrites f->age */
flow_queue_add(f, skb);
- if (unlikely(f == &q->internal)) {
- q->stat_internal_packets++;
- }
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
+#undef FQDR
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
unsigned long sample;
struct rb_node *p;
- if (q->time_next_delayed_flow > now)
+ if (q->time_next_delayed_flow > now + q->offload_horizon)
return;
/* Update unthrottle latency EWMA.
* This is cheap and can help diagnosing timer/latency problems.
*/
sample = (unsigned long)(now - q->time_next_delayed_flow);
- q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
- q->unthrottle_latency_ns += sample >> 3;
+ if ((long)sample > 0) {
+ q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+ q->unthrottle_latency_ns += sample >> 3;
+ }
+ now += q->offload_horizon;
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
@@ -411,78 +638,104 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
}
}
+static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
+{
+ if (pband->credit <= 0)
+ return NULL;
+
+ if (pband->new_flows.first)
+ return &pband->new_flows;
+
+ return pband->old_flows.first ? &pband->old_flows : NULL;
+}
+
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_perband_flows *pband;
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
unsigned long rate;
+ int retry;
u32 plen;
u64 now;
if (!sch->q.qlen)
return NULL;
- skb = fq_dequeue_head(sch, &q->internal);
- if (skb)
+ skb = fq_peek(&q->internal);
+ if (unlikely(skb)) {
+ q->internal.qlen--;
+ fq_dequeue_skb(sch, &q->internal, skb);
goto out;
+ }
now = ktime_get_ns();
fq_check_throttled(q, now);
+ retry = 0;
+ pband = &q->band_flows[q->band_nr];
begin:
- head = &q->new_flows;
- if (!head->first) {
- head = &q->old_flows;
- if (!head->first) {
- if (q->time_next_delayed_flow != ~0ULL)
- qdisc_watchdog_schedule_ns(&q->watchdog,
- q->time_next_delayed_flow);
- return NULL;
+ head = fq_pband_head_select(pband);
+ if (!head) {
+ while (++retry <= FQ_BANDS) {
+ if (++q->band_nr == FQ_BANDS)
+ q->band_nr = 0;
+ pband = &q->band_flows[q->band_nr];
+ pband->credit = min(pband->credit + pband->quantum,
+ pband->quantum);
+ if (pband->credit > 0)
+ goto begin;
+ retry = 0;
}
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_range_ns(&q->watchdog,
+ q->time_next_delayed_flow,
+ q->timer_slack);
+ return NULL;
}
f = head->first;
-
+ retry = 0;
if (f->credit <= 0) {
f->credit += q->quantum;
head->first = f->next;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_add_tail(q, f, OLD_FLOW);
goto begin;
}
- skb = f->head;
+ skb = fq_peek(f);
if (skb) {
- u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+ u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
f->time_next_packet);
- if (now < time_next_packet) {
+ if (now + q->offload_horizon < time_next_packet) {
head->first = f->next;
f->time_next_packet = time_next_packet;
fq_flow_set_throttled(q, f);
goto begin;
}
- if (time_next_packet &&
- (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ prefetch(&skb->end);
+ fq_dequeue_skb(sch, f, skb);
+ if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
- }
-
- skb = fq_dequeue_head(sch, f);
- if (!skb) {
+ if (--f->qlen == 0)
+ q->inactive_flows++;
+ q->band_pkt_count[fq_skb_cb(skb)->band]--;
+ } else {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
- if ((head == &q->new_flows) && q->old_flows.first) {
- fq_flow_add_tail(&q->old_flows, f);
+ if (head == &pband->new_flows) {
+ fq_flow_add_tail(q, f, OLD_FLOW);
} else {
fq_flow_set_detached(f);
- q->inactive_flows++;
}
goto begin;
}
- prefetch(&skb->end);
plen = qdisc_pkt_len(skb);
f->credit -= plen;
+ pband->credit -= plen;
if (!q->rate_enable)
goto out;
@@ -495,7 +748,7 @@ begin:
*/
if (!skb->tstamp) {
if (skb->sk)
- rate = min(skb->sk->sk_pacing_rate, rate);
+ rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
if (rate <= q->low_rate_threshold) {
f->credit = 0;
@@ -527,12 +780,20 @@ begin:
f->time_next_packet = now + len;
}
out:
- qdisc_bstats_update(sch, skb);
return skb;
}
static void fq_flow_purge(struct fq_flow *flow)
{
+ struct rb_node *p = rb_first(&flow->t_root);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &flow->t_root);
+ rtnl_kfree_skbs(skb, skb);
+ }
rtnl_kfree_skbs(flow->head, flow->tail);
flow->head = NULL;
flow->qlen = 0;
@@ -565,8 +826,10 @@ static void fq_reset(struct Qdisc *sch)
kmem_cache_free(fq_flow_cachep, f);
}
}
- q->new_flows.first = NULL;
- q->old_flows.first = NULL;
+ for (idx = 0; idx < FQ_BANDS; idx++) {
+ q->band_flows[idx].new_flows.first = NULL;
+ q->band_flows[idx].old_flows.first = NULL;
+ }
q->delayed = RB_ROOT;
q->flows = 0;
q->inactive_flows = 0;
@@ -649,7 +912,7 @@ static int fq_resize(struct Qdisc *sch, u32 log)
fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
q->fq_root = array;
- q->fq_trees_log = log;
+ WRITE_ONCE(q->fq_trees_log, log);
sch_tree_unlock(sch);
@@ -658,33 +921,109 @@ static int fq_resize(struct Qdisc *sch, u32 log)
return 0;
}
+static const struct netlink_range_validation iq_range = {
+ .max = INT_MAX,
+};
+
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK },
+
[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
- [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_INITIAL_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+ [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
+ [TCA_FQ_HORIZON] = { .type = NLA_U32 },
+ [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
+ [TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)),
+ [TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)),
+ [TCA_FQ_OFFLOAD_HORIZON] = { .type = NLA_U32 },
};
+/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
+static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
+{
+ const int num_elems = TC_PRIO_MAX + 1;
+ u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE];
+ int i;
+
+ memset(tmp, 0, sizeof(tmp));
+ for (i = 0; i < num_elems; i++)
+ tmp[i / 4] |= in[i] << (2 * (i & 0x3));
+
+ for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++)
+ WRITE_ONCE(out[i], tmp[i]);
+}
+
+static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
+{
+ const int num_elems = TC_PRIO_MAX + 1;
+ int i;
+
+ for (i = 0; i < num_elems; i++)
+ out[i] = fq_prio2band(in, i);
+}
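A hedged round-trip demo of the 2-bit "crumb" packing used for prio2band: the 16 priority slots (TC_PRIO_MAX + 1), each holding a band index 0..3, fit into 4 bytes. The mapping values below are arbitrary example data, not the qdisc defaults.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define NPRIO 16
#define CRUMB_BYTES (NPRIO >> 2)

static void pack(const uint8_t *in, uint8_t *out)
{
	int i;

	memset(out, 0, CRUMB_BYTES);
	for (i = 0; i < NPRIO; i++)
		out[i / 4] |= in[i] << (2 * (i & 0x3));
}

static uint8_t unpack_one(const uint8_t *packed, int i)
{
	return (packed[i / 4] >> (2 * (i & 0x3))) & 0x3;
}

int main(void)
{
	uint8_t map[NPRIO] = { 1, 2, 2, 2, 1, 2, 0, 0,
			       1, 2, 1, 1, 1, 1, 1, 1 };
	uint8_t packed[CRUMB_BYTES];
	int i;

	pack(map, packed);
	for (i = 0; i < NPRIO; i++)
		assert(unpack_one(packed, i) == map[i]);
	return 0;
}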
+
+static int fq_load_weights(struct fq_sched_data *q,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ s32 *weights = nla_data(attr);
+ int i;
+
+ for (i = 0; i < FQ_BANDS; i++) {
+ if (weights[i] < FQ_MIN_WEIGHT) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less than minimum allowed %d",
+ weights[i], FQ_MIN_WEIGHT);
+ return -EINVAL;
+ }
+ }
+ for (i = 0; i < FQ_BANDS; i++)
+ WRITE_ONCE(q->band_flows[i].quantum, weights[i]);
+ return 0;
+}
+
+static int fq_load_priomap(struct fq_sched_data *q,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const struct tc_prio_qopt *map = nla_data(attr);
+ int i;
+
+ if (map->bands != FQ_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
+ return -EINVAL;
+ }
+ for (i = 0; i < TC_PRIO_MAX + 1; i++) {
+ if (map->priomap[i] >= FQ_BANDS) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
+ i, map->priomap[i]);
+ return -EINVAL;
+ }
+ }
+ fq_prio2band_compress_crumb(map->priomap, q->prio2band);
+ return 0;
+}
+
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct fq_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_MAX + 1];
- int err, drop_count = 0;
- unsigned drop_len = 0;
u32 fq_log;
+ int err;
- if (!opt)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
+ NULL);
if (err < 0)
return err;
@@ -701,22 +1040,27 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
err = -EINVAL;
}
if (tb[TCA_FQ_PLIMIT])
- sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+ WRITE_ONCE(sch->limit,
+ nla_get_u32(tb[TCA_FQ_PLIMIT]));
if (tb[TCA_FQ_FLOW_PLIMIT])
- q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+ WRITE_ONCE(q->flow_plimit,
+ nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]));
if (tb[TCA_FQ_QUANTUM]) {
u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
- if (quantum > 0)
- q->quantum = quantum;
- else
+ if (quantum > 0 && quantum <= (1 << 20)) {
+ WRITE_ONCE(q->quantum, quantum);
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
err = -EINVAL;
+ }
}
if (tb[TCA_FQ_INITIAL_QUANTUM])
- q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+ WRITE_ONCE(q->initial_quantum,
+ nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]));
if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
@@ -725,17 +1069,19 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_FQ_FLOW_MAX_RATE]) {
u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
- q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
+ WRITE_ONCE(q->flow_max_rate,
+ (rate == ~0U) ? ~0UL : rate);
}
if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
- q->low_rate_threshold =
- nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+ WRITE_ONCE(q->low_rate_threshold,
+ nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]));
if (tb[TCA_FQ_RATE_ENABLE]) {
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
if (enable <= 1)
- q->rate_enable = enable;
+ WRITE_ONCE(q->rate_enable,
+ enable);
else
err = -EINVAL;
}
@@ -743,31 +1089,67 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
- q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+ WRITE_ONCE(q->flow_refill_delay,
+ usecs_to_jiffies(usecs_delay));
}
+ if (!err && tb[TCA_FQ_PRIOMAP])
+ err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
+
+ if (!err && tb[TCA_FQ_WEIGHTS])
+ err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);
+
if (tb[TCA_FQ_ORPHAN_MASK])
- q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+ WRITE_ONCE(q->orphan_mask,
+ nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]));
if (tb[TCA_FQ_CE_THRESHOLD])
- q->ce_threshold = (u64)NSEC_PER_USEC *
- nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+ WRITE_ONCE(q->ce_threshold,
+ (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]));
+
+ if (tb[TCA_FQ_TIMER_SLACK])
+ WRITE_ONCE(q->timer_slack,
+ nla_get_u32(tb[TCA_FQ_TIMER_SLACK]));
+
+ if (tb[TCA_FQ_HORIZON])
+ WRITE_ONCE(q->horizon,
+ (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_HORIZON]));
+
+ if (tb[TCA_FQ_HORIZON_DROP])
+ WRITE_ONCE(q->horizon_drop,
+ nla_get_u8(tb[TCA_FQ_HORIZON_DROP]));
+
+ if (tb[TCA_FQ_OFFLOAD_HORIZON]) {
+ u64 offload_horizon = (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]);
+ if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) {
+ WRITE_ONCE(q->offload_horizon, offload_horizon);
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon");
+ err = -EINVAL;
+ }
+ }
if (!err) {
+
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
sch_tree_lock(sch);
}
+
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = fq_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
if (!skb)
break;
- drop_len += qdisc_pkt_len(skb);
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
- drop_count++;
}
- qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return err;
@@ -786,7 +1168,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct fq_sched_data *q = qdisc_priv(sch);
- int err;
+ int i, err;
sch->limit = 10000;
q->flow_plimit = 100;
@@ -796,17 +1178,28 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->flow_max_rate = ~0UL;
q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
- q->new_flows.first = NULL;
- q->old_flows.first = NULL;
+ for (i = 0; i < FQ_BANDS; i++) {
+ q->band_flows[i].new_flows.first = NULL;
+ q->band_flows[i].old_flows.first = NULL;
+ }
+ q->band_flows[0].quantum = 9 << 16;
+ q->band_flows[1].quantum = 3 << 16;
+ q->band_flows[2].quantum = 1 << 16;
q->delayed = RB_ROOT;
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
+ q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
+
+ q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
+ q->horizon_drop = 1; /* by default, drop packets beyond horizon */
+
/* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
+ fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
if (opt)
@@ -820,31 +1213,68 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
- u64 ce_threshold = q->ce_threshold;
+ struct tc_prio_qopt prio = {
+ .bands = FQ_BANDS,
+ };
struct nlattr *opts;
+ u64 offload_horizon;
+ u64 ce_threshold;
+ s32 weights[3];
+ u64 horizon;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+ ce_threshold = READ_ONCE(q->ce_threshold);
do_div(ce_threshold, NSEC_PER_USEC);
- if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
- nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
- nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
- nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
- nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+ horizon = READ_ONCE(q->horizon);
+ do_div(horizon, NSEC_PER_USEC);
+
+ offload_horizon = READ_ONCE(q->offload_horizon);
+ do_div(offload_horizon, NSEC_PER_USEC);
+
+ if (nla_put_u32(skb, TCA_FQ_PLIMIT,
+ READ_ONCE(sch->limit)) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT,
+ READ_ONCE(q->flow_plimit)) ||
+ nla_put_u32(skb, TCA_FQ_QUANTUM,
+ READ_ONCE(q->quantum)) ||
+ nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM,
+ READ_ONCE(q->initial_quantum)) ||
+ nla_put_u32(skb, TCA_FQ_RATE_ENABLE,
+ READ_ONCE(q->rate_enable)) ||
nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
- min_t(unsigned long, q->flow_max_rate, ~0U)) ||
+ min_t(unsigned long,
+ READ_ONCE(q->flow_max_rate), ~0U)) ||
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
- jiffies_to_usecs(q->flow_refill_delay)) ||
- nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+ jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) ||
+ nla_put_u32(skb, TCA_FQ_ORPHAN_MASK,
+ READ_ONCE(q->orphan_mask)) ||
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
- q->low_rate_threshold) ||
+ READ_ONCE(q->low_rate_threshold)) ||
nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
- nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG,
+ READ_ONCE(q->fq_trees_log)) ||
+ nla_put_u32(skb, TCA_FQ_TIMER_SLACK,
+ READ_ONCE(q->timer_slack)) ||
+ nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
+ nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) ||
+ nla_put_u8(skb, TCA_FQ_HORIZON_DROP,
+ READ_ONCE(q->horizon_drop)))
+ goto nla_put_failure;
+
+ fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
+ if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
+ goto nla_put_failure;
+
+ weights[0] = READ_ONCE(q->band_flows[0].quantum);
+ weights[1] = READ_ONCE(q->band_flows[1].quantum);
+ weights[2] = READ_ONCE(q->band_flows[2].quantum);
+ if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -857,23 +1287,34 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct tc_fq_qd_stats st;
+ int i;
+
+ st.pad = 0;
sch_tree_lock(sch);
st.gc_flows = q->stat_gc_flows;
- st.highprio_packets = q->stat_internal_packets;
+ st.highprio_packets = 0;
+ st.fastpath_packets = q->internal.stat_fastpath_packets;
st.tcp_retrans = 0;
st.throttled = q->stat_throttled;
st.flows_plimit = q->stat_flows_plimit;
st.pkts_too_long = q->stat_pkts_too_long;
st.allocation_errors = q->stat_allocation_errors;
- st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
+ ktime_get_ns();
st.flows = q->flows;
st.inactive_flows = q->inactive_flows;
st.throttled_flows = q->throttled_flows;
st.unthrottle_latency_ns = min_t(unsigned long,
q->unthrottle_latency_ns, ~0U);
st.ce_mark = q->stat_ce_mark;
+ st.horizon_drops = q->stat_horizon_drops;
+ st.horizon_caps = q->stat_horizon_caps;
+ for (i = 0; i < FQ_BANDS; i++) {
+ st.band_drops[i] = q->stat_band_drops[i];
+ st.band_pkt_count[i] = q->band_pkt_count[i];
+ }
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
@@ -894,6 +1335,7 @@ static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
.dump_stats = fq_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("fq");
static int __init fq_module_init(void)
{
@@ -901,7 +1343,7 @@ static int __init fq_module_init(void)
fq_flow_cachep = kmem_cache_create("fq_flow_cache",
sizeof(struct fq_flow),
- 0, 0, NULL);
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (!fq_flow_cachep)
return -ENOMEM;
@@ -921,3 +1363,4 @@ module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fair Queue Packet Scheduler");
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index cd04d40c30b6..dc187c7f06b1 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Fair Queue CoDel discipline
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
*/
@@ -18,7 +14,6 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -49,7 +44,6 @@ struct fq_codel_flow {
struct sk_buff *tail;
struct list_head flowchain;
int deficit;
- u32 dropped; /* number of drops (or ECN marks) on this flow */
struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */
@@ -97,7 +91,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
return fq_codel_hash(q, skb) + 1;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tcf_classify(skb, filter, &res, false);
+ result = tcf_classify(skb, NULL, filter, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -105,7 +99,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return 0;
}
@@ -174,10 +168,12 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
skb = dequeue_head(flow);
len += qdisc_pkt_len(skb);
mem += get_codel_cb(skb)->mem_usage;
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT);
__qdisc_drop(skb, to_free);
} while (++i < max_packets && len < threshold);
- flow->dropped += i;
+ /* Tell codel to increase its signal strength also */
+ flow->cvars.count += i;
q->backlogs[idx] -= len;
q->memory_usage -= mem;
sch->qstats.drops += i;
@@ -192,7 +188,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct fq_codel_sched_data *q = qdisc_priv(sch);
unsigned int idx, prev_backlog, prev_qlen;
struct fq_codel_flow *flow;
- int uninitialized_var(ret);
+ int ret;
unsigned int pkt_len;
bool memory_limited;
@@ -215,7 +211,6 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
list_add_tail(&flow->flowchain, &q->new_flows);
q->new_flow_count++;
flow->deficit = q->quantum;
- flow->dropped = 0;
}
get_codel_cb(skb)->mem_usage = skb->truesize;
q->memory_usage += get_codel_cb(skb)->mem_usage;
@@ -280,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- kfree_skb(skb);
+ qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED);
qdisc_qstats_drop(sch);
}
@@ -290,7 +285,6 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
struct sk_buff *skb;
struct fq_codel_flow *flow;
struct list_head *head;
- u32 prev_drop_count, prev_ecn_mark;
begin:
head = &q->new_flows;
@@ -307,16 +301,10 @@ begin:
goto begin;
}
- prev_drop_count = q->cstats.drop_count;
- prev_ecn_mark = q->cstats.ecn_mark;
-
skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
&flow->cvars, &q->cstats, qdisc_pkt_len,
codel_get_enqueue_time, drop_func, dequeue_func);
- flow->dropped += q->cstats.drop_count - prev_drop_count;
- flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
-
if (!skb) {
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && !list_empty(&q->old_flows))
@@ -327,10 +315,8 @@ begin:
}
qdisc_bstats_update(sch, skb);
flow->deficit -= qdisc_pkt_len(skb);
- /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
- * or HTB crashes. Defer it for next round.
- */
- if (q->cstats.drop_count && sch->q.qlen) {
+
+ if (q->cstats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
q->cstats.drop_len);
q->cstats.drop_count = 0;
@@ -360,8 +346,6 @@ static void fq_codel_reset(struct Qdisc *sch)
codel_vars_init(&flow->cvars);
}
memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
- sch->q.qlen = 0;
- sch->qstats.backlog = 0;
q->memory_usage = 0;
}
@@ -375,20 +359,21 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },
[TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR] = { .type = NLA_U8 },
+ [TCA_FQ_CODEL_CE_THRESHOLD_MASK] = { .type = NLA_U8 },
};
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
+ u32 quantum = 0;
int err;
- if (!opt)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FQ_CODEL_MAX, opt,
+ fq_codel_policy, NULL);
if (err < 0)
return err;
if (tb[TCA_FQ_CODEL_FLOWS]) {
@@ -399,52 +384,74 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
q->flows_cnt > 65536)
return -EINVAL;
}
+ if (tb[TCA_FQ_CODEL_QUANTUM]) {
+ quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+ if (quantum > FQ_CODEL_QUANTUM_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid quantum");
+ return -EINVAL;
+ }
+ }
sch_tree_lock(sch);
if (tb[TCA_FQ_CODEL_TARGET]) {
u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
- q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ WRITE_ONCE(q->cparams.target,
+ (target * NSEC_PER_USEC) >> CODEL_SHIFT);
}
if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) {
u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]);
- q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
+ WRITE_ONCE(q->cparams.ce_threshold,
+ (val * NSEC_PER_USEC) >> CODEL_SHIFT);
}
+ if (tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR])
+ WRITE_ONCE(q->cparams.ce_threshold_selector,
+ nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]));
+ if (tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK])
+ WRITE_ONCE(q->cparams.ce_threshold_mask,
+ nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]));
+
if (tb[TCA_FQ_CODEL_INTERVAL]) {
u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
- q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ WRITE_ONCE(q->cparams.interval,
+ (interval * NSEC_PER_USEC) >> CODEL_SHIFT);
}
if (tb[TCA_FQ_CODEL_LIMIT])
- sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
+ WRITE_ONCE(sch->limit,
+ nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]));
if (tb[TCA_FQ_CODEL_ECN])
- q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
+ WRITE_ONCE(q->cparams.ecn,
+ !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]));
- if (tb[TCA_FQ_CODEL_QUANTUM])
- q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+ if (quantum)
+ WRITE_ONCE(q->quantum, quantum);
if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])
- q->drop_batch_size = min(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));
+ WRITE_ONCE(q->drop_batch_size,
+ max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])));
if (tb[TCA_FQ_CODEL_MEMORY_LIMIT])
- q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]));
+ WRITE_ONCE(q->memory_limit,
+ min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])));
while (sch->q.qlen > sch->limit ||
q->memory_usage > q->memory_limit) {
- struct sk_buff *skb = fq_codel_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
+
+ if (!skb)
+ break;
- q->cstats.drop_len += qdisc_pkt_len(skb);
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
- q->cstats.drop_count++;
}
- qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
- q->cstats.drop_count = 0;
- q->cstats.drop_len = 0;
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
@@ -512,6 +519,9 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ sch->flags |= TCQ_F_DEQUEUE_DROPS;
+
return 0;
alloc_failure:
@@ -525,34 +535,43 @@ init_failure:
static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
+ codel_time_t ce_threshold;
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
- codel_time_to_us(q->cparams.target)) ||
+ codel_time_to_us(READ_ONCE(q->cparams.target))) ||
nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
- sch->limit) ||
+ READ_ONCE(sch->limit)) ||
nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
- codel_time_to_us(q->cparams.interval)) ||
+ codel_time_to_us(READ_ONCE(q->cparams.interval))) ||
nla_put_u32(skb, TCA_FQ_CODEL_ECN,
- q->cparams.ecn) ||
+ READ_ONCE(q->cparams.ecn)) ||
nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
- q->quantum) ||
+ READ_ONCE(q->quantum)) ||
nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,
- q->drop_batch_size) ||
+ READ_ONCE(q->drop_batch_size)) ||
nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT,
- q->memory_limit) ||
+ READ_ONCE(q->memory_limit)) ||
nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
- q->flows_cnt))
+ READ_ONCE(q->flows_cnt)))
goto nla_put_failure;
- if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD &&
- nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD,
- codel_time_to_us(q->cparams.ce_threshold)))
- goto nla_put_failure;
+ ce_threshold = READ_ONCE(q->cparams.ce_threshold);
+ if (ce_threshold != CODEL_DISABLED_THRESHOLD) {
+ if (nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD,
+ codel_time_to_us(ce_threshold)))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR,
+ READ_ONCE(q->cparams.ce_threshold_selector)))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK,
+ READ_ONCE(q->cparams.ce_threshold_mask)))
+ goto nla_put_failure;
+ }
return nla_nest_end(skb, opts);
@@ -600,8 +619,6 @@ static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid)
static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
- /* we cannot bypass queue discipline anymore */
- sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -664,7 +681,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
sch_tree_unlock(sch);
}
qs.backlog = q->backlogs[idx];
- qs.drops = flow->dropped;
+ qs.drops = 0;
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
@@ -682,16 +699,12 @@ static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (i = 0; i < q->flows_cnt; i++) {
- if (list_empty(&q->flows[i].flowchain) ||
- arg->count < arg->skip) {
+ if (list_empty(&q->flows[i].flowchain)) {
arg->count++;
continue;
}
- if (arg->fn(sch, i + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, i + 1, arg))
break;
- }
- arg->count++;
}
}
@@ -721,6 +734,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
.dump_stats = fq_codel_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("fq_codel");
static int __init fq_codel_module_init(void)
{
@@ -736,3 +750,4 @@ module_init(fq_codel_module_init)
module_exit(fq_codel_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fair Queue CoDel discipline");
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
new file mode 100644
index 000000000000..7b96bc3ff891
--- /dev/null
+++ b/net/sched/sch_fq_pie.c
@@ -0,0 +1,595 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Flow Queue PIE discipline
+ *
+ * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
+ * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
+ * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
+ * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
+ * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
+ * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
+ */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_cls.h>
+#include <net/pie.h>
+
+/* Flow Queue PIE
+ *
+ * Principles:
+ * - Packets are classified on flows.
+ * - This is a Stochastic model (as we use a hash, several flows might
+ * be hashed to the same slot)
+ * - Each flow has a PIE managed queue.
+ * - Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ * - For a given flow, packets are not reordered.
+ * - Drops during enqueue only.
+ * - ECN capability is off by default.
+ * - ECN threshold (if ECN is enabled) is at 10% by default.
+ * - Uses timestamps to calculate queue delay by default.
+ */
+
+/**
+ * struct fq_pie_flow - contains data for each flow
+ * @vars: pie vars associated with the flow
+ * @deficit: number of remaining byte credits
+ * @backlog: size of data in the flow
+ * @qlen: number of packets in the flow
+ * @flowchain: flowchain for the flow
+ * @head: first packet in the flow
+ * @tail: last packet in the flow
+ */
+struct fq_pie_flow {
+ struct pie_vars vars;
+ s32 deficit;
+ u32 backlog;
+ u32 qlen;
+ struct list_head flowchain;
+ struct sk_buff *head;
+ struct sk_buff *tail;
+};
+
+struct fq_pie_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct fq_pie_flow *flows;
+ struct Qdisc *sch;
+ struct list_head old_flows;
+ struct list_head new_flows;
+ struct pie_params p_params;
+ u32 ecn_prob;
+ u32 flows_cnt;
+ u32 flows_cursor;
+ u32 quantum;
+ u32 memory_limit;
+ u32 new_flow_count;
+ u32 memory_usage;
+ u32 overmemory;
+ struct pie_stats stats;
+ struct timer_list adapt_timer;
+};
+
+static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
+ struct sk_buff *skb)
+{
+ return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
+}
+
+static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->flows_cnt)
+ return TC_H_MIN(skb->priority);
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ return fq_pie_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, NULL, filter, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ fallthrough;
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->flows_cnt)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/* add skb to flow queue (tail add) */
+static inline void flow_queue_add(struct fq_pie_flow *flow,
+ struct sk_buff *skb)
+{
+ if (!flow->head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct fq_pie_flow *sel_flow;
+ int ret;
+ u8 memory_limited = false;
+ u8 enqueue = false;
+ u32 pkt_len;
+ u32 idx;
+
+ /* Classifies packet into corresponding flow */
+ idx = fq_pie_classify(skb, sch, &ret);
+ if (idx == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+ idx--;
+
+ sel_flow = &q->flows[idx];
+ /* Checks whether adding a new packet would exceed memory limit */
+ get_pie_cb(skb)->mem_usage = skb->truesize;
+ memory_limited = q->memory_usage > q->memory_limit + skb->truesize;
+
+ /* Checks if the qdisc is full */
+ if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+ q->stats.overlimit++;
+ goto out;
+ } else if (unlikely(memory_limited)) {
+ q->overmemory++;
+ }
+
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
+
+ if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
+ sel_flow->backlog, skb->len)) {
+ enqueue = true;
+ } else if (q->p_params.ecn &&
+ sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
+ INET_ECN_set_ce(skb)) {
+ /* If packet is ecn capable, mark it if drop probability
+ * is lower than the parameter ecn_prob, else drop it.
+ */
+ q->stats.ecn_mark++;
+ enqueue = true;
+ }
+ if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->p_params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
+ pkt_len = qdisc_pkt_len(skb);
+ q->stats.packets_in++;
+ q->memory_usage += skb->truesize;
+ sch->qstats.backlog += pkt_len;
+ sch->q.qlen++;
+ flow_queue_add(sel_flow, skb);
+ if (list_empty(&sel_flow->flowchain)) {
+ list_add_tail(&sel_flow->flowchain, &q->new_flows);
+ q->new_flow_count++;
+ sel_flow->deficit = q->quantum;
+ sel_flow->qlen = 0;
+ sel_flow->backlog = 0;
+ }
+ sel_flow->qlen++;
+ sel_flow->backlog += pkt_len;
+ return NET_XMIT_SUCCESS;
+ }
+out:
+ q->stats.dropped++;
+ sel_flow->vars.accu_prob = 0;
+ qdisc_drop_reason(skb, sch, to_free, reason);
+ return NET_XMIT_CN;
+}
+
+static const struct netlink_range_validation fq_pie_q_range = {
+ .min = 1,
+ .max = 1 << 20,
+};
+
+static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
+ [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_QUANTUM] =
+ NLA_POLICY_FULL_RANGE(NLA_U32, &fq_pie_q_range),
+ [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
+};
+
+static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ flow->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = NULL;
+ struct fq_pie_flow *flow;
+ struct list_head *head;
+ u32 pkt_len;
+
+begin:
+ head = &q->new_flows;
+ if (list_empty(head)) {
+ head = &q->old_flows;
+ if (list_empty(head))
+ return NULL;
+ }
+
+ flow = list_first_entry(head, struct fq_pie_flow, flowchain);
+ /* Flow has exhausted all its credits */
+ if (flow->deficit <= 0) {
+ flow->deficit += q->quantum;
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ goto begin;
+ }
+
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ pkt_len = qdisc_pkt_len(skb);
+ sch->qstats.backlog -= pkt_len;
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+ }
+
+ if (!skb) {
+ /* force a pass through old_flows to prevent starvation */
+ if (head == &q->new_flows && !list_empty(&q->old_flows))
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ else
+ list_del_init(&flow->flowchain);
+ goto begin;
+ }
+
+ flow->qlen--;
+ flow->deficit -= pkt_len;
+ flow->backlog -= pkt_len;
+ q->memory_usage -= get_pie_cb(skb)->mem_usage;
+ pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
+ return skb;
+}
+
+static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+ if (tb[TCA_FQ_PIE_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
+
+ WRITE_ONCE(q->p_params.limit, limit);
+ WRITE_ONCE(sch->limit, limit);
+ }
+ if (tb[TCA_FQ_PIE_FLOWS]) {
+ if (q->flows) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows cannot be changed");
+ goto flow_error;
+ }
+ q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
+ if (!q->flows_cnt || q->flows_cnt > 65536) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows must range in [1..65536]");
+ goto flow_error;
+ }
+ }
+
+ /* convert from microseconds to pschedtime */
+ if (tb[TCA_FQ_PIE_TARGET]) {
+ /* target is in us */
+ u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
+
+ /* convert to pschedtime */
+ WRITE_ONCE(q->p_params.target,
+ PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC));
+ }
+
+ /* tupdate is in jiffies */
+ if (tb[TCA_FQ_PIE_TUPDATE])
+ WRITE_ONCE(q->p_params.tupdate,
+ usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])));
+
+ if (tb[TCA_FQ_PIE_ALPHA])
+ WRITE_ONCE(q->p_params.alpha,
+ nla_get_u32(tb[TCA_FQ_PIE_ALPHA]));
+
+ if (tb[TCA_FQ_PIE_BETA])
+ WRITE_ONCE(q->p_params.beta,
+ nla_get_u32(tb[TCA_FQ_PIE_BETA]));
+
+ if (tb[TCA_FQ_PIE_QUANTUM])
+ WRITE_ONCE(q->quantum, nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]));
+
+ if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
+ WRITE_ONCE(q->memory_limit,
+ nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]));
+
+ if (tb[TCA_FQ_PIE_ECN_PROB])
+ WRITE_ONCE(q->ecn_prob,
+ nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]));
+
+ if (tb[TCA_FQ_PIE_ECN])
+ WRITE_ONCE(q->p_params.ecn,
+ nla_get_u32(tb[TCA_FQ_PIE_ECN]));
+
+ if (tb[TCA_FQ_PIE_BYTEMODE])
+ WRITE_ONCE(q->p_params.bytemode,
+ nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]));
+
+ if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
+ WRITE_ONCE(q->p_params.dq_rate_estimator,
+ nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]));
+
+ /* Drop excess packets if new limit is lower */
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
+
+ if (!skb)
+ break;
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
+ rtnl_kfree_skbs(skb, skb);
+ }
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
+
+ sch_tree_unlock(sch);
+ return 0;
+
+flow_error:
+ sch_tree_unlock(sch);
+ return -EINVAL;
+}
+
+static void fq_pie_timer(struct timer_list *t)
+{
+ struct fq_pie_sched_data *q = timer_container_of(q, t, adapt_timer);
+ unsigned long next, tupdate;
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock; /* to lock qdisc for probability calculations */
+ int max_cnt, i;
+
+ rcu_read_lock();
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spin_lock(root_lock);
+
+ /* Limit this expensive loop to 2048 flows per round. */
+ max_cnt = min_t(int, q->flows_cnt - q->flows_cursor, 2048);
+ for (i = 0; i < max_cnt; i++) {
+ pie_calculate_probability(&q->p_params,
+ &q->flows[q->flows_cursor].vars,
+ q->flows[q->flows_cursor].backlog);
+ q->flows_cursor++;
+ }
+
+ tupdate = q->p_params.tupdate;
+ next = 0;
+ if (q->flows_cursor >= q->flows_cnt) {
+ q->flows_cursor = 0;
+ next = tupdate;
+ }
+ if (tupdate)
+ mod_timer(&q->adapt_timer, jiffies + next);
+ spin_unlock(root_lock);
+ rcu_read_unlock();
+}
+
+static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ int err;
+ u32 idx;
+
+ pie_params_init(&q->p_params);
+ sch->limit = 10 * 1024;
+ q->p_params.limit = sch->limit;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->sch = sch;
+ q->ecn_prob = 10;
+ q->flows_cnt = 1024;
+ q->memory_limit = SZ_32M;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ timer_setup(&q->adapt_timer, fq_pie_timer, 0);
+
+ if (opt) {
+ err = fq_pie_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ goto init_failure;
+
+ q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
+ GFP_KERNEL);
+ if (!q->flows) {
+ err = -ENOMEM;
+ goto init_failure;
+ }
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+
+ mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+
+ return 0;
+
+init_failure:
+ q->flows_cnt = 0;
+
+ return err;
+}
+
+static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (!opts)
+ return -EMSGSIZE;
+
+ /* convert target from pschedtime to us */
+ if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, READ_ONCE(sch->limit)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_FLOWS, READ_ONCE(q->flows_cnt)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TARGET,
+ ((u32)PSCHED_TICKS2NS(READ_ONCE(q->p_params.target))) /
+ NSEC_PER_USEC) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
+ jiffies_to_usecs(READ_ONCE(q->p_params.tupdate))) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ALPHA, READ_ONCE(q->p_params.alpha)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BETA, READ_ONCE(q->p_params.beta)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, READ_ONCE(q->quantum)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT,
+ READ_ONCE(q->memory_limit)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, READ_ONCE(q->ecn_prob)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN, READ_ONCE(q->p_params.ecn)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, READ_ONCE(q->p_params.bytemode)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
+ READ_ONCE(q->p_params.dq_rate_estimator)))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_pie_xstats st = {
+ .packets_in = q->stats.packets_in,
+ .overlimit = q->stats.overlimit,
+ .overmemory = q->overmemory,
+ .dropped = q->stats.dropped,
+ .ecn_mark = q->stats.ecn_mark,
+ .new_flow_count = q->new_flow_count,
+ .memory_usage = q->memory_usage,
+ };
+ struct list_head *pos;
+
+ sch_tree_lock(sch);
+ list_for_each(pos, &q->new_flows)
+ st.new_flows_len++;
+
+ list_for_each(pos, &q->old_flows)
+ st.old_flows_len++;
+ sch_tree_unlock(sch);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void fq_pie_reset(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ u32 idx;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ /* Removes all packets from flow */
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+}
+
+static void fq_pie_destroy(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ q->p_params.tupdate = 0;
+ timer_delete_sync(&q->adapt_timer);
+ kvfree(q->flows);
+}
+
+static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
+ .id = "fq_pie",
+ .priv_size = sizeof(struct fq_pie_sched_data),
+ .enqueue = fq_pie_qdisc_enqueue,
+ .dequeue = fq_pie_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_pie_init,
+ .destroy = fq_pie_destroy,
+ .reset = fq_pie_reset,
+ .change = fq_pie_change,
+ .dump = fq_pie_dump,
+ .dump_stats = fq_pie_dump_stats,
+ .owner = THIS_MODULE,
+};
+MODULE_ALIAS_NET_SCH("fq_pie");
+
+static int __init fq_pie_module_init(void)
+{
+ return register_qdisc(&fq_pie_qdisc_ops);
+}
+
+static void __exit fq_pie_module_exit(void)
+{
+ unregister_qdisc(&fq_pie_qdisc_ops);
+}
+
+module_init(fq_pie_module_init);
+module_exit(fq_pie_module_exit);
+
+MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
+MODULE_AUTHOR("Mohit P. Tahiliani");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
new file mode 100644
index 000000000000..d1d87dce7f3f
--- /dev/null
+++ b/net/sched/sch_frag.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+#include <linux/if_vlan.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+
+struct sch_frag_data {
+ unsigned long dst;
+ struct qdisc_skb_cb cb;
+ __be16 inner_protocol;
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ unsigned int l2_len;
+ u8 l2_data[VLAN_ETH_HLEN];
+ int (*xmit)(struct sk_buff *skb);
+ local_lock_t bh_lock;
+};
+
+static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);
+
+ lockdep_assert_held(&data->bh_lock);
+ if (skb_cow_head(skb, data->l2_len) < 0) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ __skb_dst_copy(skb, data->dst);
+ *qdisc_skb_cb(skb) = data->cb;
+ skb->inner_protocol = data->inner_protocol;
+ if (data->vlan_tci & VLAN_CFI_MASK)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto,
+ data->vlan_tci & ~VLAN_CFI_MASK);
+ else
+ __vlan_hwaccel_clear_tag(skb);
+
+ /* Reconstruct the MAC header. */
+ skb_push(skb, data->l2_len);
+ memcpy(skb->data, &data->l2_data, data->l2_len);
+ skb_postpush_rcsum(skb, skb->data, data->l2_len);
+ skb_reset_mac_header(skb);
+
+ return data->xmit(skb);
+}
+
+static void sch_frag_prepare_frag(struct sk_buff *skb,
+ int (*xmit)(struct sk_buff *skb))
+{
+ unsigned int hlen = skb_network_offset(skb);
+ struct sch_frag_data *data;
+
+ data = this_cpu_ptr(&sch_frag_data_storage);
+ data->dst = skb->_skb_refdst;
+ data->cb = *qdisc_skb_cb(skb);
+ data->xmit = xmit;
+ data->inner_protocol = skb->inner_protocol;
+ if (skb_vlan_tag_present(skb))
+ data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK;
+ else
+ data->vlan_tci = 0;
+ data->vlan_proto = skb->vlan_proto;
+ data->l2_len = hlen;
+ memcpy(&data->l2_data, skb->data, hlen);
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ skb_pull(skb, hlen);
+}
+
+static unsigned int
+sch_frag_dst_get_mtu(const struct dst_entry *dst)
+{
+ return dst->dev->mtu;
+}
+
+static struct dst_ops sch_frag_dst_ops = {
+ .family = AF_UNSPEC,
+ .mtu = sch_frag_dst_get_mtu,
+};
+
+static int sch_fragment(struct net *net, struct sk_buff *skb,
+ u16 mru, int (*xmit)(struct sk_buff *skb))
+{
+ int ret = -1;
+
+ if (skb_network_offset(skb) > VLAN_ETH_HLEN) {
+ net_warn_ratelimited("L2 header too long to fragment\n");
+ goto err;
+ }
+
+ if (skb_protocol(skb, true) == htons(ETH_P_IP)) {
+ struct rtable sch_frag_rt = { 0 };
+ unsigned long orig_dst;
+
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
+ sch_frag_prepare_frag(skb, xmit);
+ dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
+ DST_OBSOLETE_NONE, DST_NOCOUNT);
+ sch_frag_rt.dst.dev = skb->dev;
+
+ orig_dst = skb->_skb_refdst;
+ skb_dst_set_noref(skb, &sch_frag_rt.dst);
+ IPCB(skb)->frag_max_size = mru;
+
+ ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
+ refdst_drop(orig_dst);
+ } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) {
+ unsigned long orig_dst;
+ struct rt6_info sch_frag_rt;
+
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
+ sch_frag_prepare_frag(skb, xmit);
+ memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
+ dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
+ DST_OBSOLETE_NONE, DST_NOCOUNT);
+ sch_frag_rt.dst.dev = skb->dev;
+
+ orig_dst = skb->_skb_refdst;
+ skb_dst_set_noref(skb, &sch_frag_rt.dst);
+ IP6CB(skb)->frag_max_size = mru;
+
+ ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb,
+ sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
+ refdst_drop(orig_dst);
+ } else {
+ net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",
+ netdev_name(skb->dev),
+ ntohs(skb_protocol(skb, true)), mru,
+ skb->dev->mtu);
+ goto err;
+ }
+
+ return ret;
+err:
+ kfree_skb(skb);
+ return ret;
+}
+
+int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb))
+{
+ u16 mru = tc_skb_cb(skb)->mru;
+ int err;
+
+ if (mru && skb->len > mru + skb->dev->hard_header_len)
+ err = sch_fragment(dev_net(skb->dev), skb, mru, xmit);
+ else
+ err = xmit(skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sch_frag_xmit_hook);
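[Editor's note] sch_frag_xmit_hook() is the only export of this new file: it stashes the L2 header and qdisc cb in per-CPU scratch space, lets ip_do_fragment()/ipv6_fragment() split the payload, and re-pushes the saved header onto every fragment before handing it to the caller's xmit function. A hedged kernel-context sketch of how a caller is expected to use it; this is not standalone-compilable, and resume_xmit()/deliver() are hypothetical names, not functions from this patch.

/* Hypothetical caller, e.g. an action resuming transmission after an
 * earlier defragmentation step recorded the MRU in tc_skb_cb(skb)->mru.
 */
static int resume_xmit(struct sk_buff *skb)
{
	return dev_queue_xmit(skb);	/* continue down the datapath */
}

static int deliver(struct sk_buff *skb)
{
	/* Refragments IPv4/IPv6 when skb->len > mru + hard_header_len,
	 * otherwise calls resume_xmit(skb) directly.
	 */
	return sch_frag_xmit_hook(skb, resume_xmit);
}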
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 66ba2ce2320f..852e603c1755 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_generic.c Generic packet scheduler routines.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
* - Ingress support
@@ -28,16 +24,40 @@
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
+#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+#include <net/hotdata.h>
#include <trace/events/qdisc.h>
+#include <trace/events/net.h>
#include <net/xfrm.h>
/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);
+static void qdisc_maybe_clear_missed(struct Qdisc *q,
+ const struct netdev_queue *txq)
+{
+ clear_bit(__QDISC_STATE_MISSED, &q->state);
+
+ /* Make sure the below netif_xmit_frozen_or_stopped()
+ * checking happens after clearing STATE_MISSED.
+ */
+ smp_mb__after_atomic();
+
+ /* Checking netif_xmit_frozen_or_stopped() again to
+ * make sure STATE_MISSED is set if the STATE_MISSED
+ * set by netif_tx_wake_queue()'s rescheduling of
+ * net_tx_action() is cleared by the above clear_bit().
+ */
+ if (!netif_xmit_frozen_or_stopped(txq))
+ set_bit(__QDISC_STATE_MISSED, &q->state);
+ else
+ set_bit(__QDISC_STATE_DRAINING, &q->state);
+}
+
/* Main transmission queue. */
/* Modifications to data participating in scheduling must be protected with
@@ -49,6 +69,8 @@ EXPORT_SYMBOL(default_qdisc_ops);
* - updates to tree and tree walking are only done under the rtnl mutex.
*/
+#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
+
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
const struct netdev_queue *txq = q->dev_queue;
@@ -74,7 +96,8 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
q->q.qlen--;
}
} else {
- skb = NULL;
+ skb = SKB_XOFF_MAGIC;
+ qdisc_maybe_clear_missed(q, txq);
}
}
@@ -118,60 +141,49 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
spin_unlock(lock);
}
-static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
- while (skb) {
- struct sk_buff *next = skb->next;
-
- __skb_queue_tail(&q->gso_skb, skb);
- q->qstats.requeues++;
- qdisc_qstats_backlog_inc(q, skb);
- q->q.qlen++; /* it's still part of the queue */
+ spinlock_t *lock = NULL;
- skb = next;
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
}
- __netif_schedule(q);
-
- return 0;
-}
-
-static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
-{
- spinlock_t *lock = qdisc_lock(q);
- spin_lock(lock);
while (skb) {
struct sk_buff *next = skb->next;
__skb_queue_tail(&q->gso_skb, skb);
- qdisc_qstats_cpu_requeues_inc(q);
- qdisc_qstats_cpu_backlog_inc(q, skb);
- qdisc_qstats_cpu_qlen_inc(q);
+ /* it's still part of the queue */
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_requeues_inc(q);
+ qdisc_qstats_cpu_backlog_inc(q, skb);
+ qdisc_qstats_cpu_qlen_inc(q);
+ } else {
+ q->qstats.requeues++;
+ qdisc_qstats_backlog_inc(q, skb);
+ q->q.qlen++;
+ }
skb = next;
}
- spin_unlock(lock);
-
- __netif_schedule(q);
- return 0;
-}
-
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
-{
- if (q->flags & TCQ_F_NOLOCK)
- return dev_requeue_skb_locked(skb, q);
- else
- return __dev_requeue_skb(skb, q);
+ if (lock) {
+ spin_unlock(lock);
+ set_bit(__QDISC_STATE_MISSED, &q->state);
+ } else {
+ __netif_schedule(q);
+ }
}
static void try_bulk_dequeue_skb(struct Qdisc *q,
struct sk_buff *skb,
const struct netdev_queue *txq,
- int *packets)
+ int *packets, int budget)
{
int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
+ int cnt = 0;
while (bytelimit > 0) {
struct sk_buff *nskb = q->dequeue(q);
@@ -182,8 +194,10 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
bytelimit -= nskb->len; /* covers GSO len */
skb->next = nskb;
skb = nskb;
- (*packets)++; /* GSO counts as one pkt */
+ if (++cnt >= budget)
+ break;
}
+ (*packets) += cnt;
skb_mark_not_on_list(skb);
}
@@ -217,7 +231,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
* A requeued skb (via q->gso_skb) can also be a SKB list.
*/
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
- int *packets)
+ int *packets, int budget)
{
const struct netdev_queue *txq = q->dev_queue;
struct sk_buff *skb = NULL;
@@ -259,6 +273,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
}
} else {
skb = NULL;
+ qdisc_maybe_clear_missed(q, txq);
}
if (lock)
spin_unlock(lock);
@@ -268,17 +283,22 @@ validate:
*validate = true;
if ((q->flags & TCQ_F_ONETXQUEUE) &&
- netif_xmit_frozen_or_stopped(txq))
+ netif_xmit_frozen_or_stopped(txq)) {
+ qdisc_maybe_clear_missed(q, txq);
return skb;
+ }
skb = qdisc_dequeue_skb_bad_txq(q);
- if (unlikely(skb))
+ if (unlikely(skb)) {
+ if (skb == SKB_XOFF_MAGIC)
+ return NULL;
goto bulk;
+ }
skb = q->dequeue(q);
if (skb) {
bulk:
if (qdisc_may_bulk(q))
- try_bulk_dequeue_skb(q, skb, txq, packets);
+ try_bulk_dequeue_skb(q, skb, txq, packets, budget);
else
try_bulk_dequeue_skb_slow(q, skb, packets);
}
@@ -289,8 +309,8 @@ trace:
/*
* Transmit possibly several skbs, and handle the return status as
- * required. Owning running seqcount bit guarantees that
- * only one CPU can execute this function.
+ * required. Owning qdisc running bit guarantees that only one CPU
+ * can execute this function.
*
* Returns to the caller:
* false - hardware queue frozen backoff
@@ -325,6 +345,8 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq))
skb = dev_hard_start_xmit(skb, dev, txq, &ret);
+ else
+ qdisc_maybe_clear_missed(q, txq);
HARD_TX_UNLOCK(dev, txq);
} else {
@@ -368,7 +390,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
* >0 - queue is not empty.
*
*/
-static inline bool qdisc_restart(struct Qdisc *q, int *packets)
+static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget)
{
spinlock_t *root_lock = NULL;
struct netdev_queue *txq;
@@ -377,7 +399,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets)
bool validate;
/* Dequeue packet */
- skb = dequeue_skb(q, &validate, packets);
+ skb = dequeue_skb(q, &validate, packets, budget);
if (unlikely(!skb))
return false;
@@ -392,18 +414,17 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets)
void __qdisc_run(struct Qdisc *q)
{
- int quota = dev_tx_weight;
+ int quota = READ_ONCE(net_hotdata.dev_tx_weight);
int packets;
- while (qdisc_restart(q, &packets)) {
- /*
- * Ordered by possible occurrence: Postpone processing if
- * 1. we've exceeded packet quota
- * 2. another process needs the CPU;
- */
+ while (qdisc_restart(q, &packets, quota)) {
quota -= packets;
- if (quota <= 0 || need_resched()) {
- __netif_schedule(q);
+ if (quota <= 0) {
+ if (q->flags & TCQ_F_NOLOCK)
+ set_bit(__QDISC_STATE_MISSED, &q->state);
+ else
+ __netif_schedule(q);
+
break;
}
}
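[Editor's note] The __qdisc_run() hunk above passes the remaining tx quota down as a bulk-dequeue budget, so try_bulk_dequeue_skb() can no longer pull an unbounded GSO train in one pass. A toy userspace model of that accounting, with made-up numbers standing in for net_hotdata.dev_tx_weight and the queue depth:

#include <stdio.h>

/* Each "restart" pulls one packet plus up to (budget - 1) bulked packets;
 * the outer loop stops once the tx weight is spent.
 */
static int restart(int *queued, int budget)
{
	int pulled = 0;

	while (*queued && pulled < budget) {	/* bulk capped by budget */
		(*queued)--;
		pulled++;
	}
	return pulled;
}

int main(void)
{
	int quota = 64;		/* stand-in for dev_tx_weight */
	int queued = 1000;
	int packets;

	while ((packets = restart(&queued, quota)) > 0) {
		quota -= packets;
		if (quota <= 0) {
			printf("budget spent, %d packets left for later\n",
			       queued);
			break;
		}
	}
	return 0;
}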
@@ -411,16 +432,12 @@ void __qdisc_run(struct Qdisc *q)
unsigned long dev_trans_start(struct net_device *dev)
{
- unsigned long val, res;
+ unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start);
+ unsigned long val;
unsigned int i;
- if (is_vlan_dev(dev))
- dev = vlan_dev_real_dev(dev);
- else if (netif_is_macvlan(dev))
- dev = macvlan_dev_real_dev(dev);
- res = netdev_get_tx_queue(dev, 0)->trans_start;
for (i = 1; i < dev->num_tx_queues; i++) {
- val = netdev_get_tx_queue(dev, i)->trans_start;
+ val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start);
if (val && time_after(val, res))
res = val;
}
@@ -429,70 +446,133 @@ unsigned long dev_trans_start(struct net_device *dev)
}
EXPORT_SYMBOL(dev_trans_start);
+static void netif_freeze_queues(struct net_device *dev)
+{
+ unsigned int i;
+ int cpu;
+
+ cpu = smp_processor_id();
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
+ /* We are the only thread of execution doing a
+ * freeze, but we have to grab the _xmit_lock in
+ * order to synchronize with threads which are in
+ * the ->hard_start_xmit() handler and already
+ * checked the frozen bit.
+ */
+ __netif_tx_lock(txq, cpu);
+ set_bit(__QUEUE_STATE_FROZEN, &txq->state);
+ __netif_tx_unlock(txq);
+ }
+}
+
+void netif_tx_lock(struct net_device *dev)
+{
+ spin_lock(&dev->tx_global_lock);
+ netif_freeze_queues(dev);
+}
+EXPORT_SYMBOL(netif_tx_lock);
+
+static void netif_unfreeze_queues(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
+ /* No need to grab the _xmit_lock here. If the
+ * queue is not stopped for another reason, we
+ * force a schedule.
+ */
+ clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
+ netif_schedule_queue(txq);
+ }
+}
+
+void netif_tx_unlock(struct net_device *dev)
+{
+ netif_unfreeze_queues(dev);
+ spin_unlock(&dev->tx_global_lock);
+}
+EXPORT_SYMBOL(netif_tx_unlock);
+
static void dev_watchdog(struct timer_list *t)
{
- struct net_device *dev = from_timer(dev, t, watchdog_timer);
+ struct net_device *dev = timer_container_of(dev, t, watchdog_timer);
+ bool release = true;
- netif_tx_lock(dev);
+ spin_lock(&dev->tx_global_lock);
if (!qdisc_tx_is_noop(dev)) {
if (netif_device_present(dev) &&
netif_running(dev) &&
netif_carrier_ok(dev)) {
- int some_queue_timedout = 0;
+ unsigned int timedout_ms = 0;
unsigned int i;
unsigned long trans_start;
+ unsigned long oldest_start = jiffies;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq;
txq = netdev_get_tx_queue(dev, i);
- trans_start = txq->trans_start;
- if (netif_xmit_stopped(txq) &&
- time_after(jiffies, (trans_start +
- dev->watchdog_timeo))) {
- some_queue_timedout = 1;
- txq->trans_timeout++;
+ if (!netif_xmit_stopped(txq))
+ continue;
+
+ /* Paired with WRITE_ONCE() + smp_mb...() in
+ * netdev_tx_sent_queue() and netif_tx_stop_queue().
+ */
+ smp_mb();
+ trans_start = READ_ONCE(txq->trans_start);
+
+ if (time_after(jiffies, trans_start + dev->watchdog_timeo)) {
+ timedout_ms = jiffies_to_msecs(jiffies - trans_start);
+ atomic_long_inc(&txq->trans_timeout);
break;
}
+ if (time_after(oldest_start, trans_start))
+ oldest_start = trans_start;
}
- if (some_queue_timedout) {
- WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
- dev->name, netdev_drivername(dev), i);
- dev->netdev_ops->ndo_tx_timeout(dev);
+ if (unlikely(timedout_ms)) {
+ trace_net_dev_xmit_timeout(dev, i);
+ netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n",
+ raw_smp_processor_id(),
+ i, timedout_ms);
+ netif_freeze_queues(dev);
+ dev->netdev_ops->ndo_tx_timeout(dev, i);
+ netif_unfreeze_queues(dev);
}
if (!mod_timer(&dev->watchdog_timer,
- round_jiffies(jiffies +
+ round_jiffies(oldest_start +
dev->watchdog_timeo)))
- dev_hold(dev);
+ release = false;
}
}
- netif_tx_unlock(dev);
+ spin_unlock(&dev->tx_global_lock);
- dev_put(dev);
+ if (release)
+ netdev_put(dev, &dev->watchdog_dev_tracker);
}
-void __netdev_watchdog_up(struct net_device *dev)
+void netdev_watchdog_up(struct net_device *dev)
{
- if (dev->netdev_ops->ndo_tx_timeout) {
- if (dev->watchdog_timeo <= 0)
- dev->watchdog_timeo = 5*HZ;
- if (!mod_timer(&dev->watchdog_timer,
- round_jiffies(jiffies + dev->watchdog_timeo)))
- dev_hold(dev);
- }
-}
-
-static void dev_watchdog_up(struct net_device *dev)
-{
- __netdev_watchdog_up(dev);
+ if (!dev->netdev_ops->ndo_tx_timeout)
+ return;
+ if (dev->watchdog_timeo <= 0)
+ dev->watchdog_timeo = 5*HZ;
+ if (!mod_timer(&dev->watchdog_timer,
+ round_jiffies(jiffies + dev->watchdog_timeo)))
+ netdev_hold(dev, &dev->watchdog_dev_tracker,
+ GFP_ATOMIC);
}
+EXPORT_SYMBOL_GPL(netdev_watchdog_up);
-static void dev_watchdog_down(struct net_device *dev)
+static void netdev_watchdog_down(struct net_device *dev)
{
netif_tx_lock_bh(dev);
- if (del_timer(&dev->watchdog_timer))
- dev_put(dev);
+ if (timer_delete(&dev->watchdog_timer))
+ netdev_put(dev, &dev->watchdog_dev_tracker);
netif_tx_unlock_bh(dev);
}
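[Editor's note] The reworked dev_watchdog() above only examines stopped queues, reports the stall age in milliseconds, and re-arms the timer relative to the oldest outstanding trans_start instead of "now", so a stall is noticed without an extra full watchdog period of slack. A toy userspace model of that arithmetic, with plain integers standing in for jiffies and hard-coded sample queues:

#include <stdio.h>

struct txq { int stopped; long trans_start; };

int main(void)
{
	struct txq q[3] = { { 1, 100 }, { 0, 40 }, { 1, 160 } };
	long now = 200, timeo = 120, oldest_start = now;
	int timedout = -1;

	for (int i = 0; i < 3; i++) {
		if (!q[i].stopped)
			continue;
		if (now - q[i].trans_start > timeo) {
			timedout = i;	/* would call ndo_tx_timeout(dev, i) */
			break;
		}
		if (q[i].trans_start < oldest_start)
			oldest_start = q[i].trans_start;
	}
	if (timedout >= 0)
		printf("queue %d timed out after %ld ticks\n",
		       timedout, now - q[timedout].trans_start);
	else
		printf("re-arm watchdog at %ld\n", oldest_start + timeo);
	return 0;
}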
@@ -500,7 +580,7 @@ static void dev_watchdog_down(struct net_device *dev)
* netif_carrier_on - set carrier
* @dev: network device
*
- * Device has detected that carrier.
+ * Device has detected acquisition of carrier.
*/
void netif_carrier_on(struct net_device *dev)
{
@@ -510,7 +590,7 @@ void netif_carrier_on(struct net_device *dev)
atomic_inc(&dev->carrier_up_count);
linkwatch_fire_event(dev);
if (netif_running(dev))
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_carrier_on);
@@ -532,6 +612,24 @@ void netif_carrier_off(struct net_device *dev)
}
EXPORT_SYMBOL(netif_carrier_off);
+/**
+ * netif_carrier_event - report carrier state event
+ * @dev: network device
+ *
+ * Device has detected a carrier event but the carrier state wasn't changed.
+ * Use in drivers when querying carrier state asynchronously, to avoid missing
+ * events (link flaps) if link recovers before it's queried.
+ */
+void netif_carrier_event(struct net_device *dev)
+{
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ return;
+ atomic_inc(&dev->carrier_up_count);
+ atomic_inc(&dev->carrier_down_count);
+ linkwatch_fire_event(dev);
+}
+EXPORT_SYMBOL_GPL(netif_carrier_event);
+
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
under all circumstances. It is difficult to invent anything faster or
cheaper.
@@ -540,6 +638,7 @@ EXPORT_SYMBOL(netif_carrier_off);
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
+ dev_core_stats_tx_dropped_inc(skb->dev);
__qdisc_drop(skb, to_free);
return NET_XMIT_CN;
}
@@ -559,8 +658,8 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = {
};
static struct netdev_queue noop_netdev_queue = {
- .qdisc = &noop_qdisc,
- .qdisc_sleeping = &noop_qdisc,
+ RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
+ RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc),
};
struct Qdisc noop_qdisc = {
@@ -570,8 +669,6 @@ struct Qdisc noop_qdisc = {
.ops = &noop_qdisc_ops,
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
- .running = SEQCNT_ZERO(noop_qdisc.running),
- .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
.gso_skb = {
.next = (struct sk_buff *)&noop_qdisc.gso_skb,
.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -607,9 +704,10 @@ struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
.owner = THIS_MODULE,
};
-static const u8 prio2band[TC_PRIO_MAX + 1] = {
- 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
+EXPORT_SYMBOL(sch_default_prio2band);
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
@@ -634,7 +732,7 @@ static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- int band = prio2band[skb->priority & TC_PRIO_MAX];
+ int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct skb_array *q = band2list(priv, band);
unsigned int pkt_len = qdisc_pkt_len(skb);
@@ -642,14 +740,16 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
err = skb_array_produce(q, skb);
- if (unlikely(err))
- return qdisc_drop_cpu(skb, qdisc, to_free);
+ if (unlikely(err)) {
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT);
- qdisc_qstats_cpu_qlen_inc(qdisc);
- /* Note: skb can not be used after skb_array_produce(),
- * so we better not use qdisc_qstats_cpu_backlog_inc()
- */
- this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len);
+ if (qdisc_is_percpu_stats(qdisc))
+ return qdisc_drop_cpu(skb, qdisc, to_free);
+ else
+ return qdisc_drop(skb, qdisc, to_free);
+ }
+
+ qdisc_update_stats_at_enqueue(qdisc, pkt_len);
return NET_XMIT_SUCCESS;
}
@@ -657,8 +757,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct sk_buff *skb = NULL;
+ bool need_retry = true;
int band;
+retry:
for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
struct skb_array *q = band2list(priv, band);
@@ -668,9 +770,25 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
skb = __skb_array_consume(q);
}
if (likely(skb)) {
- qdisc_qstats_cpu_backlog_dec(qdisc, skb);
- qdisc_bstats_cpu_update(qdisc, skb);
- qdisc_qstats_cpu_qlen_dec(qdisc);
+ qdisc_update_stats_at_dequeue(qdisc, skb);
+ } else if (need_retry &&
+ READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) {
+ /* Delay clearing the STATE_MISSED here to reduce
+ * the overhead of the second spin_trylock() in
+ * qdisc_run_begin() and __netif_schedule() calling
+ * in qdisc_run_end().
+ */
+ clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+ clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
+
+ /* Make sure dequeuing happens after clearing
+ * STATE_MISSED.
+ */
+ smp_mb__after_atomic();
+
+ need_retry = false;
+
+ goto retry;
}
return skb;
@@ -710,11 +828,14 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
kfree_skb(skb);
}
- for_each_possible_cpu(i) {
- struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ if (qdisc_is_percpu_stats(qdisc)) {
+ for_each_possible_cpu(i) {
+ struct gnet_stats_queue *q;
- q->backlog = 0;
- q->qlen = 0;
+ q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ q->backlog = 0;
+ q->qlen = 0;
+ }
}
}
@@ -722,7 +843,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
- memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
+ memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
@@ -789,8 +910,8 @@ static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
bands[prio] = q;
}
- return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
- GFP_KERNEL);
+ return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len,
+ GFP_KERNEL);
}
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
@@ -810,15 +931,13 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
EXPORT_SYMBOL(pfifo_fast_ops);
static struct lock_class_key qdisc_tx_busylock;
-static struct lock_class_key qdisc_running_key;
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops,
struct netlink_ext_ack *extack)
{
- void *p;
struct Qdisc *sch;
- unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
+ unsigned int size = sizeof(*sch) + ops->priv_size;
int err = -ENOBUFS;
struct net_device *dev;
@@ -829,30 +948,20 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
}
dev = dev_queue->dev;
- p = kzalloc_node(size, GFP_KERNEL,
- netdev_queue_numa_node_read(dev_queue));
+ sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));
- if (!p)
+ if (!sch)
goto errout;
- sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- /* if we got non aligned memory, ask more and do alignment ourself */
- if (sch != p) {
- kfree(p);
- p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
- netdev_queue_numa_node_read(dev_queue));
- if (!p)
- goto errout;
- sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- sch->padded = (char *) sch - (char *) p;
- }
__skb_queue_head_init(&sch->gso_skb);
__skb_queue_head_init(&sch->skb_bad_txq);
- qdisc_skb_head_init(&sch->q);
+ gnet_stats_basic_sync_init(&sch->bstats);
+ lockdep_register_key(&sch->root_lock_key);
spin_lock_init(&sch->q.lock);
+ lockdep_set_class(&sch->q.lock, &sch->root_lock_key);
if (ops->static_flags & TCQ_F_CPUSTATS) {
sch->cpu_bstats =
- netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
if (!sch->cpu_bstats)
goto errout1;
@@ -863,30 +972,23 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
}
}
- spin_lock_init(&sch->busylock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
/* seqlock has the same scope of busylock, for NOLOCK qdisc */
spin_lock_init(&sch->seqlock);
- lockdep_set_class(&sch->busylock,
+ lockdep_set_class(&sch->seqlock,
dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
- seqcount_init(&sch->running);
- lockdep_set_class(&sch->running,
- dev->qdisc_running_key ?: &qdisc_running_key);
-
sch->ops = ops;
sch->flags = ops->static_flags;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
- dev_hold(dev);
+ netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
refcount_set(&sch->refcnt, 1);
return sch;
errout1:
- kfree(p);
+ lockdep_unregister_key(&sch->root_lock_key);
+ kfree(sch);
errout:
return ERR_PTR(err);
}
@@ -898,20 +1000,22 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
{
struct Qdisc *sch;
- if (!try_module_get(ops->owner)) {
+ if (!bpf_try_module_get(ops, ops->owner)) {
NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
return NULL;
}
sch = qdisc_alloc(dev_queue, ops, extack);
if (IS_ERR(sch)) {
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
return NULL;
}
sch->parent = parentid;
- if (!ops->init || ops->init(sch, NULL, extack) == 0)
+ if (!ops->init || ops->init(sch, NULL, extack) == 0) {
+ trace_qdisc_create(ops, dev_queue->dev, parentid);
return sch;
+ }
qdisc_put(sch);
return NULL;
@@ -923,20 +1027,14 @@ EXPORT_SYMBOL(qdisc_create_dflt);
void qdisc_reset(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
- struct sk_buff *skb, *tmp;
+
+ trace_qdisc_reset(qdisc);
if (ops->reset)
ops->reset(qdisc);
- skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
- __skb_unlink(skb, &qdisc->gso_skb);
- kfree_skb_list(skb);
- }
-
- skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
- __skb_unlink(skb, &qdisc->skb_bad_txq);
- kfree_skb_list(skb);
- }
+ __skb_queue_purge(&qdisc->gso_skb);
+ __skb_queue_purge(&qdisc->skb_bad_txq);
qdisc->q.qlen = 0;
qdisc->qstats.backlog = 0;
@@ -950,7 +1048,7 @@ void qdisc_free(struct Qdisc *qdisc)
free_percpu(qdisc->cpu_qstats);
}
- kfree((char *) qdisc - qdisc->padded);
+ kfree(qdisc);
}
static void qdisc_free_cb(struct rcu_head *head)
@@ -960,10 +1058,10 @@ static void qdisc_free_cb(struct rcu_head *head)
qdisc_free(q);
}
-static void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
- struct sk_buff *skb, *tmp;
+ struct net_device *dev = qdisc_dev(qdisc);
#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);
@@ -971,34 +1069,40 @@ static void qdisc_destroy(struct Qdisc *qdisc)
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->rate_est);
- if (ops->reset)
- ops->reset(qdisc);
+
+ qdisc_reset(qdisc);
+
if (ops->destroy)
ops->destroy(qdisc);
- module_put(ops->owner);
- dev_put(qdisc_dev(qdisc));
+ lockdep_unregister_key(&qdisc->root_lock_key);
+ bpf_module_put(ops, ops->owner);
+ netdev_put(dev, &qdisc->dev_tracker);
- skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
- __skb_unlink(skb, &qdisc->gso_skb);
- kfree_skb_list(skb);
- }
-
- skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
- __skb_unlink(skb, &qdisc->skb_bad_txq);
- kfree_skb_list(skb);
- }
+ trace_qdisc_destroy(qdisc);
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ __qdisc_destroy(qdisc);
+}
+
void qdisc_put(struct Qdisc *qdisc)
{
+ if (!qdisc)
+ return;
+
if (qdisc->flags & TCQ_F_BUILTIN ||
!refcount_dec_and_test(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
@@ -1013,7 +1117,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
!refcount_dec_and_rtnl_lock(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);
@@ -1022,7 +1126,7 @@ EXPORT_SYMBOL(qdisc_put_unlocked);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc)
{
- struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
+ struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
spinlock_t *root_lock;
root_lock = qdisc_lock(oqdisc);
@@ -1031,7 +1135,7 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
/* ... and graft new one */
if (qdisc == NULL)
qdisc = &noop_qdisc;
- dev_queue->qdisc_sleeping = qdisc;
+ rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
spin_unlock_bh(root_lock);
@@ -1040,6 +1144,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
}
EXPORT_SYMBOL(dev_graft_qdisc);
+static void shutdown_scheduler_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_qdisc_default)
+{
+ struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
+ struct Qdisc *qdisc_default = _qdisc_default;
+
+ if (qdisc) {
+ rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+ rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default);
+
+ qdisc_put(qdisc);
+ }
+}
+
static void attach_one_default_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_unused)
@@ -1049,15 +1168,16 @@ static void attach_one_default_qdisc(struct net_device *dev,
if (dev->priv_flags & IFF_NO_QUEUE)
ops = &noqueue_qdisc_ops;
+	else if (dev->type == ARPHRD_CAN)
+ ops = &pfifo_fast_ops;
qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
- if (!qdisc) {
- netdev_info(dev, "activation failed\n");
+ if (!qdisc)
return;
- }
+
if (!netif_is_multiqueue(dev))
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
- dev_queue->qdisc_sleeping = qdisc;
+ rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
}
static void attach_default_qdiscs(struct net_device *dev)
@@ -1070,18 +1190,34 @@ static void attach_default_qdiscs(struct net_device *dev)
if (!netif_is_multiqueue(dev) ||
dev->priv_flags & IFF_NO_QUEUE) {
netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
- dev->qdisc = txq->qdisc_sleeping;
- qdisc_refcount_inc(dev->qdisc);
+ qdisc = rtnl_dereference(txq->qdisc_sleeping);
+ rcu_assign_pointer(dev->qdisc, qdisc);
+ qdisc_refcount_inc(qdisc);
} else {
qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
if (qdisc) {
- dev->qdisc = qdisc;
+ rcu_assign_pointer(dev->qdisc, qdisc);
qdisc->ops->attach(qdisc);
}
}
+ qdisc = rtnl_dereference(dev->qdisc);
+
+ /* Detect default qdisc setup/init failed and fallback to "noqueue" */
+ if (qdisc == &noop_qdisc) {
+ netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
+ default_qdisc_ops->id, noqueue_qdisc_ops.id);
+ netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
+ dev->priv_flags |= IFF_NO_QUEUE;
+ netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
+ qdisc = rtnl_dereference(txq->qdisc_sleeping);
+ rcu_assign_pointer(dev->qdisc, qdisc);
+ qdisc_refcount_inc(qdisc);
+ dev->priv_flags ^= IFF_NO_QUEUE;
+ }
+
#ifdef CONFIG_NET_SCHED
- if (dev->qdisc != &noop_qdisc)
- qdisc_hash_add(dev->qdisc, false);
+ if (qdisc != &noop_qdisc)
+ qdisc_hash_add(qdisc, false);
#endif
}
@@ -1089,7 +1225,7 @@ static void transition_one_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_need_watchdog)
{
- struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
+ struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
int *need_watchdog_p = _need_watchdog;
if (!(new_qdisc->flags & TCQ_F_BUILTIN))
@@ -1097,7 +1233,7 @@ static void transition_one_qdisc(struct net_device *dev,
rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
if (need_watchdog_p) {
- dev_queue->trans_start = 0;
+ WRITE_ONCE(dev_queue->trans_start, 0);
*need_watchdog_p = 1;
}
}
@@ -1111,7 +1247,7 @@ void dev_activate(struct net_device *dev)
* and noqueue_qdisc for virtual interfaces
*/
- if (dev->qdisc == &noop_qdisc)
+ if (rtnl_dereference(dev->qdisc) == &noop_qdisc)
attach_default_qdiscs(dev);
if (!netif_carrier_ok(dev))
@@ -1125,35 +1261,59 @@ void dev_activate(struct net_device *dev)
if (need_watchdog) {
netif_trans_update(dev);
- dev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(dev_activate);
+static void qdisc_deactivate(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
+}
+
static void dev_deactivate_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
- void *_qdisc_default)
+ void *_sync_needed)
{
- struct Qdisc *qdisc_default = _qdisc_default;
+ bool *sync_needed = _sync_needed;
struct Qdisc *qdisc;
qdisc = rtnl_dereference(dev_queue->qdisc);
if (qdisc) {
- bool nolock = qdisc->flags & TCQ_F_NOLOCK;
+ if (qdisc->enqueue)
+ *sync_needed = true;
+ qdisc_deactivate(qdisc);
+ rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
+ }
+}
+
+static void dev_reset_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_unused)
+{
+ struct Qdisc *qdisc;
+ bool nolock;
- if (nolock)
- spin_lock_bh(&qdisc->seqlock);
- spin_lock_bh(qdisc_lock(qdisc));
+ qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
+ if (!qdisc)
+ return;
- if (!(qdisc->flags & TCQ_F_BUILTIN))
- set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
+ nolock = qdisc->flags & TCQ_F_NOLOCK;
- rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
- qdisc_reset(qdisc);
+ if (nolock)
+ spin_lock_bh(&qdisc->seqlock);
+ spin_lock_bh(qdisc_lock(qdisc));
- spin_unlock_bh(qdisc_lock(qdisc));
- if (nolock)
- spin_unlock_bh(&qdisc->seqlock);
+ qdisc_reset(qdisc);
+
+ spin_unlock_bh(qdisc_lock(qdisc));
+ if (nolock) {
+ clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+ clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
+ spin_unlock_bh(&qdisc->seqlock);
}
}
@@ -1168,7 +1328,7 @@ static bool some_qdisc_is_busy(struct net_device *dev)
int val;
dev_queue = netdev_get_tx_queue(dev, i);
- q = dev_queue->qdisc_sleeping;
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
root_lock = qdisc_lock(q);
spin_lock_bh(root_lock);
@@ -1184,16 +1344,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
return false;
}
-static void dev_qdisc_reset(struct net_device *dev,
- struct netdev_queue *dev_queue,
- void *none)
-{
- struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
-
- if (qdisc)
- qdisc_reset(qdisc);
-}
-
/**
* dev_deactivate_many - deactivate transmissions on several devices
* @head: list of devices to deactivate
@@ -1203,34 +1353,39 @@ static void dev_qdisc_reset(struct net_device *dev,
*/
void dev_deactivate_many(struct list_head *head)
{
+ bool sync_needed = false;
struct net_device *dev;
list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
- &noop_qdisc);
+ &sync_needed);
if (dev_ingress_queue(dev))
dev_deactivate_queue(dev, dev_ingress_queue(dev),
- &noop_qdisc);
+ &sync_needed);
- dev_watchdog_down(dev);
+ netdev_watchdog_down(dev);
}
- /* Wait for outstanding qdisc-less dev_queue_xmit calls.
- * This is avoided if all devices are in dismantle phase :
- * Caller will call synchronize_net() for us
- */
- synchronize_net();
+ /* Wait for outstanding qdisc enqueuing calls. */
+ if (sync_needed)
+ synchronize_net();
- /* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list) {
- while (some_qdisc_is_busy(dev))
- yield();
- /* The new qdisc is assigned at this point so we can safely
- * unwind stale skb lists and qdisc statistics
- */
- netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
+ netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);
+
if (dev_ingress_queue(dev))
- dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
+ dev_reset_queue(dev, dev_ingress_queue(dev), NULL);
+ }
+
+ /* Wait for outstanding qdisc_run calls. */
+ list_for_each_entry(dev, head, close_list) {
+ while (some_qdisc_is_busy(dev)) {
+ /* wait_event() would avoid this sleep-loop but would
+ * require expensive checks in the fast paths of packet
+ * processing which isn't worth it.
+ */
+ schedule_timeout_uninterruptible(1);
+ }
}
}
@@ -1247,7 +1402,7 @@ EXPORT_SYMBOL(dev_deactivate);
static int qdisc_change_tx_queue_len(struct net_device *dev,
struct netdev_queue *dev_queue)
{
- struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+ struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
const struct Qdisc_ops *ops = qdisc->ops;
if (ops->change_tx_queue_len)
@@ -1255,6 +1410,39 @@ static int qdisc_change_tx_queue_len(struct net_device *dev,
return 0;
}
+void dev_qdisc_change_real_num_tx(struct net_device *dev,
+ unsigned int new_real_tx)
+{
+ struct Qdisc *qdisc = rtnl_dereference(dev->qdisc);
+
+ if (qdisc->ops->change_real_num_tx)
+ qdisc->ops->change_real_num_tx(qdisc, new_real_tx);
+}
+
+void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx)
+{
+#ifdef CONFIG_NET_SCHED
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
+ /* Only update the default qdiscs we created,
+ * qdiscs with handles are always hashed.
+ */
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_del(qdisc);
+ }
+ for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_add(qdisc, false);
+ }
+#endif
+}
+EXPORT_SYMBOL(mq_change_real_num_tx);
+
int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
bool up = dev->flags & IFF_UP;
@@ -1284,12 +1472,12 @@ static void dev_init_scheduler_queue(struct net_device *dev,
struct Qdisc *qdisc = _qdisc;
rcu_assign_pointer(dev_queue->qdisc, qdisc);
- dev_queue->qdisc_sleeping = qdisc;
+ rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
}
void dev_init_scheduler(struct net_device *dev)
{
- dev->qdisc = &noop_qdisc;
+ rcu_assign_pointer(dev->qdisc, &noop_qdisc);
netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
@@ -1297,105 +1485,126 @@ void dev_init_scheduler(struct net_device *dev)
timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}
-static void shutdown_scheduler_queue(struct net_device *dev,
- struct netdev_queue *dev_queue,
- void *_qdisc_default)
-{
- struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
- struct Qdisc *qdisc_default = _qdisc_default;
-
- if (qdisc) {
- rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
- dev_queue->qdisc_sleeping = qdisc_default;
-
- qdisc_put(qdisc);
- }
-}
-
void dev_shutdown(struct net_device *dev)
{
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
- qdisc_put(dev->qdisc);
- dev->qdisc = &noop_qdisc;
+ qdisc_put(rtnl_dereference(dev->qdisc));
+ rcu_assign_pointer(dev->qdisc, &noop_qdisc);
WARN_ON(timer_pending(&dev->watchdog_timer));
}
+/**
+ * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division
+ * @rate: Rate to compute reciprocal division values of
+ * @mult: Multiplier for reciprocal division
+ * @shift: Shift for reciprocal division
+ *
+ * The multiplier and shift for reciprocal division by rate are stored
+ * in mult and shift.
+ *
+ * The deal here is to replace a divide by a reciprocal one
+ * in fast path (a reciprocal divide is a multiply and a shift)
+ *
+ * Normal formula would be :
+ * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ * time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
+ *
+ * reciprocal_value() is not used here because it doesn't handle 64-bit values.
+ */
+static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift)
+{
+ u64 factor = NSEC_PER_SEC;
+
+ *mult = 1;
+ *shift = 0;
+
+ if (rate <= 0)
+ return;
+
+ for (;;) {
+ *mult = div64_u64(factor, rate);
+ if (*mult & (1U << 31) || factor & (1ULL << 63))
+ break;
+ factor <<= 1;
+ (*shift)++;
+ }
+}
+
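
A quick userspace check of the mult/shift recipe described in the comment above (illustrative only, not part of the patch): for 1 Gbit/s (125000000 bytes/s) the loop settles on mult = 2^31 and shift = 28, and a 1500-byte packet costs exactly the expected 12000 ns.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t rate = 125000000;	/* 1 Gbit/s in bytes per second */
	uint64_t factor = NSEC_PER_SEC;
	uint32_t mult = 1;
	unsigned int shift = 0;

	/* same stopping condition as psched_ratecfg_precompute__() */
	for (;;) {
		mult = factor / rate;
		if ((mult & (1U << 31)) || (factor & (1ULL << 63)))
			break;
		factor <<= 1;
		shift++;
	}

	/* 1500 bytes at 125 MB/s should take exactly 12000 ns */
	printf("mult=%u shift=%u approx=%llu ns exact=%llu ns\n",
	       mult, shift,
	       (unsigned long long)(((uint64_t)1500 * mult) >> shift),
	       (unsigned long long)(1500 * NSEC_PER_SEC / rate));
	return 0;
}
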
void psched_ratecfg_precompute(struct psched_ratecfg *r,
const struct tc_ratespec *conf,
u64 rate64)
{
memset(r, 0, sizeof(*r));
r->overhead = conf->overhead;
+ r->mpu = conf->mpu;
r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
- r->mult = 1;
- /*
- * The deal here is to replace a divide by a reciprocal one
- * in fast path (a reciprocal divide is a multiply and a shift)
- *
- * Normal formula would be :
- * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
- *
- * We compute mult/shift to use instead :
- * time_in_ns = (len * mult) >> shift;
- *
- * We try to get the highest possible mult value for accuracy,
- * but have to make sure no overflows will ever happen.
- */
- if (r->rate_bytes_ps > 0) {
- u64 factor = NSEC_PER_SEC;
-
- for (;;) {
- r->mult = div64_u64(factor, r->rate_bytes_ps);
- if (r->mult & (1U << 31) || factor & (1ULL << 63))
- break;
- factor <<= 1;
- r->shift++;
- }
- }
+ psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift);
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
-static void mini_qdisc_rcu_func(struct rcu_head *head)
+void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
{
+ r->rate_pkts_ps = pktrate64;
+ psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift);
}
+EXPORT_SYMBOL(psched_ppscfg_precompute);
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
struct tcf_proto *tp_head)
{
- struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
+ /* Protected with chain0->filter_chain_lock.
+ * Can't access chain directly because tp_head can be NULL.
+ */
+ struct mini_Qdisc *miniq_old =
+ rcu_dereference_protected(*miniqp->p_miniq, 1);
struct mini_Qdisc *miniq;
if (!tp_head) {
RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
- /* Wait for flying RCU callback before it is freed. */
- rcu_barrier();
- return;
- }
+ } else {
+ miniq = miniq_old != &miniqp->miniq1 ?
+ &miniqp->miniq1 : &miniqp->miniq2;
- miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
- &miniqp->miniq1 : &miniqp->miniq2;
+ /* We need to make sure that readers won't see the miniq
+ * we are about to modify. So ensure that at least one RCU
+ * grace period has elapsed since the miniq was made
+ * inactive.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ cond_synchronize_rcu(miniq->rcu_state);
+ else if (!poll_state_synchronize_rcu(miniq->rcu_state))
+ synchronize_rcu_expedited();
- /* We need to make sure that readers won't see the miniq
- * we are about to modify. So wait until previous call_rcu callback
- * is done.
- */
- rcu_barrier();
- miniq->filter_list = tp_head;
- rcu_assign_pointer(*miniqp->p_miniq, miniq);
+ miniq->filter_list = tp_head;
+ rcu_assign_pointer(*miniqp->p_miniq, miniq);
+ }
if (miniq_old)
- /* This is counterpart of the rcu barriers above. We need to
+ /* This is counterpart of the rcu sync above. We need to
* block potential new user of miniq_old until all readers
* are not seeing it.
*/
- call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
+ miniq_old->rcu_state = start_poll_synchronize_rcu();
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);
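
mini_qdisc_pair_swap() above drops the rcu_barrier()/call_rcu() pair in favour of the polled grace-period API, so the writer only blocks when a reader could still be looking at the retired miniq. A condensed sketch of that pattern (struct buf and the helpers are made-up names, not part of this patch):

/* Illustrative sketch of the polled-RCU pattern used above.  When an
 * object is retired, record a grace-period cookie; before reusing it,
 * block only if that grace period has not yet elapsed.
 */
struct buf {
	unsigned long rcu_state;	/* cookie from start_poll_synchronize_rcu() */
	/* ... payload ... */
};

static void retire_buf(struct buf *b)
{
	b->rcu_state = start_poll_synchronize_rcu();
}

static void reuse_buf(struct buf *b)
{
	if (!poll_state_synchronize_rcu(b->rcu_state))
		synchronize_rcu_expedited();	/* wait only when needed */
	/* safe to modify b: no reader can still see the old contents */
}
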
+void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
+ struct tcf_block *block)
+{
+ miniqp->miniq1.block = block;
+ miniqp->miniq2.block = block;
+}
+EXPORT_SYMBOL(mini_qdisc_pair_block_init);
+
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
struct mini_Qdisc __rcu **p_miniq)
{
@@ -1403,6 +1612,8 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
+ miniqp->miniq1.rcu_state = get_state_synchronize_rcu();
+ miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state;
miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 234afbf9115b..532fde548b88 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -1,17 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_gred.c Generic Random Early Detection queue.
*
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
*
* 991129: - Bug fix with grio mode
* - a better sing. AvgQ mode with Grio(WRED)
- * - A finer grained VQ dequeue based on sugestion
+ * - A finer grained VQ dequeue based on suggestion
* from Ren Liu
* - More error checks
*
@@ -61,6 +56,7 @@ struct gred_sched {
u32 DPs;
u32 def;
struct red_vars wred_set;
+ struct tc_gred_qopt_offload *opt;
};
static inline int gred_wred_mode(struct gred_sched *table)
@@ -255,10 +251,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
q->stats.pdrop++;
drop:
- return qdisc_drop(skb, sch, to_free);
+ return qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
congestion_drop:
- qdisc_drop(skb, sch, to_free);
+ qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED);
return NET_XMIT_CN;
}
@@ -316,48 +312,50 @@ static void gred_offload(struct Qdisc *sch, enum tc_gred_command command)
{
struct gred_sched *table = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- struct tc_gred_qopt_offload opt = {
- .command = command,
- .handle = sch->handle,
- .parent = sch->parent,
- };
+ struct tc_gred_qopt_offload *opt = table->opt;
if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
return;
+ memset(opt, 0, sizeof(*opt));
+ opt->command = command;
+ opt->handle = sch->handle;
+ opt->parent = sch->parent;
+
if (command == TC_GRED_REPLACE) {
unsigned int i;
- opt.set.grio_on = gred_rio_mode(table);
- opt.set.wred_on = gred_wred_mode(table);
- opt.set.dp_cnt = table->DPs;
- opt.set.dp_def = table->def;
+ opt->set.grio_on = gred_rio_mode(table);
+ opt->set.wred_on = gred_wred_mode(table);
+ opt->set.dp_cnt = table->DPs;
+ opt->set.dp_def = table->def;
for (i = 0; i < table->DPs; i++) {
struct gred_sched_data *q = table->tab[i];
if (!q)
continue;
- opt.set.tab[i].present = true;
- opt.set.tab[i].limit = q->limit;
- opt.set.tab[i].prio = q->prio;
- opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog;
- opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog;
- opt.set.tab[i].is_ecn = gred_use_ecn(q);
- opt.set.tab[i].is_harddrop = gred_use_harddrop(q);
- opt.set.tab[i].probability = q->parms.max_P;
- opt.set.tab[i].backlog = &q->backlog;
+ opt->set.tab[i].present = true;
+ opt->set.tab[i].limit = q->limit;
+ opt->set.tab[i].prio = q->prio;
+ opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog;
+ opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog;
+ opt->set.tab[i].is_ecn = gred_use_ecn(q);
+ opt->set.tab[i].is_harddrop = gred_use_harddrop(q);
+ opt->set.tab[i].probability = q->parms.max_P;
+ opt->set.tab[i].backlog = &q->backlog;
}
- opt.set.qstats = &sch->qstats;
+ opt->set.qstats = &sch->qstats;
}
- dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt);
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt);
}
static int gred_offload_dump_stats(struct Qdisc *sch)
{
struct gred_sched *table = qdisc_priv(sch);
struct tc_gred_qopt_offload *hw_stats;
+ u64 bytes = 0, packets = 0;
unsigned int i;
int ret;
@@ -369,30 +367,34 @@ static int gred_offload_dump_stats(struct Qdisc *sch)
hw_stats->handle = sch->handle;
hw_stats->parent = sch->parent;
- for (i = 0; i < MAX_DPs; i++)
+ for (i = 0; i < MAX_DPs; i++) {
+ gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]);
if (table->tab[i])
hw_stats->stats.xstats[i] = &table->tab[i]->stats;
+ }
ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats);
/* Even if driver returns failure adjust the stats - in case offload
* ended but driver still wants to adjust the values.
*/
+ sch_tree_lock(sch);
for (i = 0; i < MAX_DPs; i++) {
if (!table->tab[i])
continue;
- table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets;
- table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes;
+ table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets);
+ table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes);
table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog;
- _bstats_update(&sch->bstats,
- hw_stats->stats.bstats[i].bytes,
- hw_stats->stats.bstats[i].packets);
+ bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes);
+ packets += u64_stats_read(&hw_stats->stats.bstats[i].packets);
sch->qstats.qlen += hw_stats->stats.qstats[i].qlen;
sch->qstats.backlog += hw_stats->stats.qstats[i].backlog;
sch->qstats.drops += hw_stats->stats.qstats[i].drops;
sch->qstats.requeues += hw_stats->stats.qstats[i].requeues;
sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits;
}
+ _bstats_update(&sch->bstats, bytes, packets);
+ sch_tree_unlock(sch);
kfree(hw_stats);
return ret;
@@ -485,7 +487,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
struct gred_sched *table = qdisc_priv(sch);
struct gred_sched_data *q = table->tab[dp];
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) {
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) {
NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters");
return -EINVAL;
}
@@ -538,7 +540,8 @@ static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry)
struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
u32 dp;
- nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL);
+ nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry,
+ gred_vq_policy, NULL);
dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
@@ -568,8 +571,8 @@ static int gred_vq_validate(struct gred_sched *table, u32 cdp,
int err;
u32 dp;
- err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy,
- extack);
+ err = nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry,
+ gred_vq_policy, extack);
if (err < 0)
return err;
@@ -610,8 +613,8 @@ static int gred_vqs_validate(struct gred_sched *table, u32 cdp,
const struct nlattr *attr;
int rem, err;
- err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX,
- gred_vqe_policy, extack);
+ err = nla_validate_nested_deprecated(vqs, TCA_GRED_VQ_ENTRY_MAX,
+ gred_vqe_policy, extack);
if (err < 0)
return err;
@@ -647,10 +650,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
u32 max_P;
struct gred_sched_data *prealloc;
- if (opt == NULL)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy,
+ extack);
if (err < 0)
return err;
@@ -667,7 +668,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
return -EINVAL;
}
- max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
+ max_P = nla_get_u32_default(tb[TCA_GRED_MAX_P], 0);
ctl = nla_data(tb[TCA_GRED_PARMS]);
stab = nla_data(tb[TCA_GRED_STAB]);
@@ -731,13 +732,15 @@ err_unlock_free:
static int gred_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct gred_sched *table = qdisc_priv(sch);
struct nlattr *tb[TCA_GRED_MAX + 1];
int err;
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy,
+ extack);
if (err < 0)
return err;
@@ -753,6 +756,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
sch->limit = qdisc_dev(sch)->tx_queue_len
* psched_mtu(qdisc_dev(sch));
+ if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) {
+ table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL);
+ if (!table->opt)
+ return -ENOMEM;
+ }
+
return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack);
}
@@ -772,7 +781,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
if (gred_offload_dump_stats(sch))
goto nla_put_failure;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
@@ -790,7 +799,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
goto nla_put_failure;
/* Old style all-in-one dump of VQs */
- parms = nla_nest_start(skb, TCA_GRED_PARMS);
+ parms = nla_nest_start_noflag(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
@@ -819,7 +828,6 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.Wlog = q->parms.Wlog;
opt.Plog = q->parms.Plog;
opt.Scell_log = q->parms.Scell_log;
- opt.other = q->stats.other;
opt.early = q->stats.prob_drop;
opt.forced = q->stats.forced_drop;
opt.pdrop = q->stats.pdrop;
@@ -841,7 +849,7 @@ append_opt:
nla_nest_end(skb, parms);
/* Dump the VQs again, in more structured way */
- vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST);
+ vqs = nla_nest_start_noflag(skb, TCA_GRED_VQ_LIST);
if (!vqs)
goto nla_put_failure;
@@ -852,7 +860,7 @@ append_opt:
if (!q)
continue;
- vq = nla_nest_start(skb, TCA_GRED_VQ_ENTRY);
+ vq = nla_nest_start_noflag(skb, TCA_GRED_VQ_ENTRY);
if (!vq)
goto nla_put_failure;
@@ -885,8 +893,6 @@ append_opt:
goto nla_put_failure;
if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop))
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other))
- goto nla_put_failure;
nla_nest_end(skb, vq);
}
@@ -904,11 +910,12 @@ static void gred_destroy(struct Qdisc *sch)
struct gred_sched *table = qdisc_priv(sch);
int i;
- for (i = 0; i < table->DPs; i++) {
- if (table->tab[i])
- gred_destroy_vq(table->tab[i]);
- }
- gred_offload(sch, TC_GRED_DESTROY);
+ for (i = 0; i < table->DPs; i++)
+ gred_destroy_vq(table->tab[i]);
+
+ if (table->opt)
+ gred_offload(sch, TC_GRED_DESTROY);
+ kfree(table->opt);
}
static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
@@ -924,6 +931,7 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
.dump = gred_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("gred");
static int __init gred_module_init(void)
{
@@ -939,3 +947,4 @@ module_init(gred_module_init)
module_exit(gred_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Generic Random Early Detection qdisc");
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 24cc220a3218..d8fd35da32a7 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -111,12 +111,11 @@ enum hfsc_class_flags {
struct hfsc_class {
struct Qdisc_class_common cl_common;
- struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_basic_sync bstats;
struct gnet_stats_queue qstats;
struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
struct tcf_block *block;
- unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
struct hfsc_sched *sched; /* scheduler data */
@@ -176,6 +175,11 @@ struct hfsc_sched {
#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
+static bool cl_in_el_or_vttree(struct hfsc_class *cl)
+{
+ return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) ||
+ ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node));
+}
/*
* eligible tree holds backlogged classes being sorted by their eligible times.
@@ -204,7 +208,10 @@ eltree_insert(struct hfsc_class *cl)
static inline void
eltree_remove(struct hfsc_class *cl)
{
- rb_erase(&cl->el_node, &cl->sched->eligible);
+ if (!RB_EMPTY_NODE(&cl->el_node)) {
+ rb_erase(&cl->el_node, &cl->sched->eligible);
+ RB_CLEAR_NODE(&cl->el_node);
+ }
}
static inline void
@@ -828,32 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
}
}
-static unsigned int
-qdisc_peek_len(struct Qdisc *sch)
-{
- struct sk_buff *skb;
- unsigned int len;
-
- skb = sch->ops->peek(sch);
- if (unlikely(skb == NULL)) {
- qdisc_warn_nonwc("qdisc_peek_len", sch);
- return 0;
- }
- len = qdisc_pkt_len(skb);
-
- return len;
-}
-
-static void
-hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
-{
- unsigned int len = cl->qdisc->q.qlen;
- unsigned int backlog = cl->qdisc->qstats.backlog;
-
- qdisc_reset(cl->qdisc);
- qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
-}
-
static void
hfsc_adjust_levels(struct hfsc_class *cl)
{
@@ -913,6 +894,14 @@ hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc,
cl->cl_flags |= HFSC_USC;
}
+static void
+hfsc_upgrade_rt(struct hfsc_class *cl)
+{
+ cl->cl_fsc = cl->cl_rsc;
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+ cl->cl_flags |= HFSC_FSC;
+}
+
static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = {
[TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) },
[TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) },
@@ -936,7 +925,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HFSC_MAX, opt, hfsc_policy,
+ NULL);
if (err < 0)
return err;
@@ -960,6 +950,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl != NULL) {
int old_flags;
+ int len = 0;
if (parentid) {
if (cl->cl_parent &&
@@ -974,7 +965,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
NULL,
- qdisc_root_sleeping_running(sch),
+ true,
tca[TCA_RATE]);
if (err)
return err;
@@ -990,9 +981,13 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (usc != NULL)
hfsc_change_usc(cl, usc, cur_time);
+ if (cl->qdisc->q.qlen != 0)
+ len = qdisc_peek_len(cl->qdisc);
+ /* Check queue length again since some qdisc implementations
+ * (e.g., netem/codel) might empty the queue during the peek
+ * operation.
+ */
if (cl->qdisc->q.qlen != 0) {
- int len = qdisc_peek_len(cl->qdisc);
-
if (cl->cl_flags & HFSC_RSC) {
if (old_flags & HFSC_RSC)
update_ed(cl, len);
@@ -1034,6 +1029,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl == NULL)
return -ENOBUFS;
+ RB_CLEAR_NODE(&cl->el_node);
+
err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
if (err) {
kfree(cl);
@@ -1042,9 +1039,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
- NULL,
- qdisc_root_sleeping_running(sch),
- tca[TCA_RATE]);
+ NULL, true, tca[TCA_RATE]);
if (err) {
tcf_block_put(cl->block);
kfree(cl);
@@ -1073,10 +1068,16 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
cl->cf_tree = RB_ROOT;
sch_tree_lock(sch);
+ /* Check if the inner class is a misconfigured 'rt' */
+ if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) {
+ NL_SET_ERR_MSG(extack,
+ "Forced curve change on parent 'rt' to 'sc'");
+ hfsc_upgrade_rt(parent);
+ }
qdisc_class_hash_insert(&q->clhash, &cl->cl_common);
list_add_tail(&cl->siblings, &parent->children);
if (parent->level == 0)
- hfsc_purge_queue(sch, parent);
+ qdisc_purge_queue(parent->qdisc);
hfsc_adjust_levels(parent);
sch_tree_unlock(sch);
@@ -1099,20 +1100,24 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
}
static int
-hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
+hfsc_delete_class(struct Qdisc *sch, unsigned long arg,
+ struct netlink_ext_ack *extack)
{
struct hfsc_sched *q = qdisc_priv(sch);
struct hfsc_class *cl = (struct hfsc_class *)arg;
- if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root)
+ if (cl->level > 0 || qdisc_class_in_use(&cl->cl_common) ||
+ cl == &q->root) {
+ NL_SET_ERR_MSG(extack, "HFSC class in use");
return -EBUSY;
+ }
sch_tree_lock(sch);
list_del(&cl->siblings);
hfsc_adjust_levels(cl->cl_parent);
- hfsc_purge_queue(sch, cl);
+ qdisc_purge_queue(cl->qdisc);
qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
sch_tree_unlock(sch);
@@ -1138,14 +1143,14 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
head = &q->root;
tcf = rcu_dereference_bh(q->root.filter_list);
- while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
+ while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -1168,7 +1173,8 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
}
/* classification failed, try default class */
- cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+ cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle),
+ READ_ONCE(q->defcls)), sch);
if (cl == NULL || cl->level > 0)
return NULL;
@@ -1213,7 +1219,8 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
/* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
* needs to be called explicitly to remove a class from vttree.
*/
- update_vf(cl, 0, 0);
+ if (cl->cl_nactive)
+ update_vf(cl, 0, 0);
if (cl->cl_flags & HFSC_RSC)
eltree_remove(cl);
}
@@ -1233,7 +1240,7 @@ hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
if (cl != NULL) {
if (p != NULL && p->level <= cl->level)
return 0;
- cl->filter_cnt++;
+ qdisc_class_get(&cl->cl_common);
}
return (unsigned long)cl;
@@ -1244,7 +1251,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
- cl->filter_cnt--;
+ qdisc_class_put(&cl->cl_common);
}
static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg,
@@ -1310,7 +1317,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
if (cl->level == 0)
tcm->tcm_info = cl->qdisc->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (hfsc_dump_curves(skb, cl) < 0)
@@ -1328,16 +1335,17 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
struct tc_hfsc_stats xstats;
+ __u32 qlen;
- cl->qstats.backlog = cl->qdisc->qstats.backlog;
+ qdisc_qstats_qlen_backlog(cl->qdisc, &qlen, &cl->qstats.backlog);
xstats.level = cl->level;
xstats.period = cl->cl_vtperiod;
xstats.work = cl->cl_total;
xstats.rtwork = cl->cl_cumul;
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
@@ -1358,15 +1366,8 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i],
cl_common.hnode) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
return;
- }
- arg->count++;
}
}
}
@@ -1413,6 +1414,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
if (err)
return err;
+ gnet_stats_basic_sync_init(&q->root.bstats);
q->root.cl_common.classid = sch->handle;
q->root.sched = q;
q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
@@ -1438,13 +1440,11 @@ hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt,
struct hfsc_sched *q = qdisc_priv(sch);
struct tc_hfsc_qopt *qopt;
- if (opt == NULL || nla_len(opt) < sizeof(*qopt))
+ if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;
qopt = nla_data(opt);
- sch_tree_lock(sch);
- q->defcls = qopt->defcls;
- sch_tree_unlock(sch);
+ WRITE_ONCE(q->defcls, qopt->defcls);
return 0;
}
@@ -1492,8 +1492,6 @@ hfsc_reset_qdisc(struct Qdisc *sch)
}
q->eligible = RB_ROOT;
qdisc_watchdog_cancel(&q->watchdog);
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
}
static void
@@ -1526,7 +1524,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
unsigned char *b = skb_tail_pointer(skb);
struct tc_hfsc_qopt qopt;
- qopt.defcls = q->defcls;
+ qopt.defcls = READ_ONCE(q->defcls);
if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
goto nla_put_failure;
return skb->len;
@@ -1541,7 +1539,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
unsigned int len = qdisc_pkt_len(skb);
struct hfsc_class *cl;
- int uninitialized_var(err);
+ int err;
bool first;
cl = hfsc_classify(skb, sch, &err);
@@ -1562,7 +1560,10 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
return err;
}
- if (first) {
+ sch->qstats.backlog += len;
+ sch->q.qlen++;
+
+ if (first && !cl_in_el_or_vttree(cl)) {
if (cl->cl_flags & HFSC_RSC)
init_ed(cl, len);
if (cl->cl_flags & HFSC_FSC)
@@ -1577,9 +1578,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
}
- sch->qstats.backlog += len;
- sch->q.qlen++;
-
return NET_XMIT_SUCCESS;
}
@@ -1634,10 +1632,16 @@ hfsc_dequeue(struct Qdisc *sch)
if (cl->qdisc->q.qlen != 0) {
/* update ed */
next_len = qdisc_peek_len(cl->qdisc);
- if (realtime)
- update_ed(cl, next_len);
- else
- update_d(cl, next_len);
+ /* Check queue length again since some qdisc implementations
+ * (e.g., netem/codel) might empty the queue during the peek
+ * operation.
+ */
+ if (cl->qdisc->q.qlen != 0) {
+ if (realtime)
+ update_ed(cl, next_len);
+ else
+ update_d(cl, next_len);
+ }
} else {
/* the class becomes passive */
eltree_remove(cl);
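
Both hfsc_change_class() and hfsc_dequeue() above re-check q.qlen after qdisc_peek_len(), because a child qdisc such as netem or codel may drop packets from within its ->peek(). The defensive pattern, reduced to a sketch (account_next_packet() is a placeholder name, not part of this patch):

/* Illustrative pattern only: a non-empty check taken before the peek is
 * not trustworthy afterwards, since ->peek() may itself drop packets.
 */
if (child->q.qlen != 0) {
	unsigned int len = qdisc_peek_len(child);

	if (child->q.qlen != 0)	/* peek may have emptied the queue */
		account_next_packet(cl, len);	/* stands in for update_ed()/update_d() */
}
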
@@ -1680,6 +1684,7 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct hfsc_sched),
.owner = THIS_MODULE
};
+MODULE_ALIAS_NET_SCH("hfsc");
static int __init
hfsc_init(void)
@@ -1694,5 +1699,6 @@ hfsc_cleanup(void)
}
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Hierarchical Fair Service Curve scheduler");
module_init(hfsc_init);
module_exit(hfsc_cleanup);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 9d6a47697406..2d4855e28a28 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -1,14 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
*
* Copyright (C) 2013 Terry Lam <vtlam@google.com>
* Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
*/
-#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
+#include <linux/siphash.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
@@ -125,7 +126,7 @@ struct wdrr_bucket {
struct hhf_sched_data {
struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
- u32 perturbation; /* hash perturbation */
+ siphash_key_t perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
u32 drop_overlimit; /* number of times max qdisc packet
* limit was hit
@@ -263,7 +264,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
}
/* Get hashed flow-id of the skb. */
- hash = skb_get_hash_perturb(skb, q->perturbation);
+ hash = skb_get_hash_perturb(skb, &q->perturbation);
/* Check if this packet belongs to an already established HH flow. */
flow_pos = hash & HHF_BIT_MASK;
@@ -507,18 +508,16 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct hhf_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_HHF_MAX + 1];
- unsigned int qlen, prev_backlog;
int err;
u64 non_hh_quantum;
u32 new_quantum = q->quantum;
u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
- if (!opt)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy,
+ NULL);
if (err < 0)
return err;
@@ -529,44 +528,50 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
- if (non_hh_quantum > INT_MAX)
+ if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX)
return -EINVAL;
sch_tree_lock(sch);
if (tb[TCA_HHF_BACKLOG_LIMIT])
- sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
+ WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]));
- q->quantum = new_quantum;
- q->hhf_non_hh_weight = new_hhf_non_hh_weight;
+ WRITE_ONCE(q->quantum, new_quantum);
+ WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight);
if (tb[TCA_HHF_HH_FLOWS_LIMIT])
- q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
+ WRITE_ONCE(q->hh_flows_limit,
+ nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]));
if (tb[TCA_HHF_RESET_TIMEOUT]) {
u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
- q->hhf_reset_timeout = usecs_to_jiffies(us);
+ WRITE_ONCE(q->hhf_reset_timeout,
+ usecs_to_jiffies(us));
}
if (tb[TCA_HHF_ADMIT_BYTES])
- q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
+ WRITE_ONCE(q->hhf_admit_bytes,
+ nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]));
if (tb[TCA_HHF_EVICT_TIMEOUT]) {
u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
- q->hhf_evict_timeout = usecs_to_jiffies(us);
+ WRITE_ONCE(q->hhf_evict_timeout,
+ usecs_to_jiffies(us));
}
- qlen = sch->q.qlen;
- prev_backlog = sch->qstats.backlog;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = hhf_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
+
+ if (!skb)
+ break;
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
}
- qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
- prev_backlog - sch->qstats.backlog);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
@@ -580,7 +585,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
sch->limit = 1000;
q->quantum = psched_mtu(qdisc_dev(sch));
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
INIT_LIST_HEAD(&q->new_buckets);
INIT_LIST_HEAD(&q->old_buckets);
@@ -654,19 +659,22 @@ static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
struct hhf_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
- nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
- nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
+ if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) ||
+ nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) ||
+ nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT,
+ READ_ONCE(q->hh_flows_limit)) ||
nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
- jiffies_to_usecs(q->hhf_reset_timeout)) ||
- nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
+ jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) ||
+ nla_put_u32(skb, TCA_HHF_ADMIT_BYTES,
+ READ_ONCE(q->hhf_admit_bytes)) ||
nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
- jiffies_to_usecs(q->hhf_evict_timeout)) ||
- nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
+ jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) ||
+ nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT,
+ READ_ONCE(q->hhf_non_hh_weight)))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -703,6 +711,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
.dump_stats = hhf_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("hhf");
static int __init hhf_module_init(void)
{
@@ -719,3 +728,4 @@ module_exit(hhf_module_exit)
MODULE_AUTHOR("Terry Lam");
MODULE_AUTHOR("Nandita Dukkipati");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 30f9da7e1076..b5e40c51655a 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_htb.c Hierarchical token bucket, feed tree version
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Martin Devera, <devik@cdi.cz>
*
* Credits (in time order) for older HTB versions:
@@ -56,7 +52,7 @@
*/
static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
-#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */
+#define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */
#if HTB_VER >> 16 != TC_HTB_PROTOVER
#error "Mismatched sch_htb.c and pkt_sch.h"
@@ -106,7 +102,6 @@ struct htb_class {
struct tcf_proto __rcu *filter_list; /* class attached filters */
struct tcf_block *block;
- int filter_cnt;
int level; /* our level (see above) */
unsigned int children;
@@ -117,7 +112,8 @@ struct htb_class {
/*
* Written often fields
*/
- struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_basic_sync bstats;
+ struct gnet_stats_basic_sync bstats_bias;
struct tc_htb_xstats xstats; /* our special stats */
/* token bucket parameters */
@@ -128,6 +124,7 @@ struct htb_class {
struct htb_class_leaf {
int deficit[TC_HTB_MAXDEPTH];
struct Qdisc *q;
+ struct netdev_queue *offload_queue;
} leaf;
struct htb_class_inner {
struct htb_prio clprio[TC_HTB_NUMPRIO];
@@ -165,7 +162,8 @@ struct htb_sched {
/* non shaped skbs; let them go directly thru */
struct qdisc_skb_head direct_queue;
- long direct_pkts;
+ u32 direct_pkts;
+ u32 overlimits;
struct qdisc_watchdog watchdog;
@@ -177,6 +175,11 @@ struct htb_sched {
int row_mask[TC_HTB_MAXDEPTH];
struct htb_level hlevel[TC_HTB_MAXDEPTH];
+
+ struct Qdisc **direct_qdiscs;
+ unsigned int num_direct_qdiscs;
+
+ bool offload;
};
/* find class in global hash table using given handle */
@@ -195,8 +198,14 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle)
{
return (unsigned long)htb_find(handle, sch);
}
+
+#define HTB_DIRECT ((struct htb_class *)-1L)
+
/**
* htb_classify - classify a packet into class
+ * @skb: the socket buffer
+ * @sch: the active queue discipline
+ * @qerr: pointer for returned status code
*
* It returns NULL if the packet should be dropped or -1 if the packet
* should be passed directly thru. In all other cases leaf class is returned.
@@ -207,8 +216,6 @@ static unsigned long htb_search(struct Qdisc *sch, u32 handle)
* have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
* then finish and return direct queue.
*/
-#define HTB_DIRECT ((struct htb_class *)-1L)
-
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
@@ -235,14 +242,14 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
}
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
+ while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
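Editor's note: for orientation, this is roughly how the enqueue path consumes the three results documented in the htb_classify() kernel-doc above (NULL, HTB_DIRECT, or a leaf class). A condensed sketch, not a verbatim copy of htb_enqueue():

	struct htb_class *cl = htb_classify(skb, sch, &ret);

	if (cl == HTB_DIRECT) {
		/* bypass shaping: queue on q->direct_queue, bounded by q->direct_qlen */
	} else if (!cl) {
		/* filters asked for a drop (or classification failed):
		 * free the skb and report the verdict stored in ret */
	} else {
		/* normal case: hand the packet to the leaf class's own qdisc and,
		 * on success, activate the class so dequeue will service it */
		ret = qdisc_enqueue(skb, cl->leaf.q, to_free);
		if (ret == NET_XMIT_SUCCESS)
			htb_activate(q, cl);
	}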
@@ -270,6 +277,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
/**
* htb_add_to_id_tree - adds class to the round robin list
+ * @root: the root of the tree
+ * @cl: the class to add
+ * @prio: the given prio in class
*
* Routine adds class to the list (actually tree) sorted by classid.
* Make sure that class is not already on such list for given prio.
@@ -295,6 +305,9 @@ static void htb_add_to_id_tree(struct rb_root *root,
/**
* htb_add_to_wait_tree - adds class to the event queue with delay
+ * @q: the priority event queue
+ * @cl: the class to add
+ * @delay: delay in microseconds
*
* The class is added to priority event queue to indicate that class will
* change its mode in cl->pq_key microseconds. Make sure that class is not
@@ -328,17 +341,22 @@ static void htb_add_to_wait_tree(struct htb_sched *q,
/**
* htb_next_rb_node - finds next node in binary tree
+ * @n: the current node in binary tree
*
* When we are past last key we return NULL.
* Average complexity is 2 steps per call.
*/
static inline void htb_next_rb_node(struct rb_node **n)
{
- *n = rb_next(*n);
+ if (*n)
+ *n = rb_next(*n);
}
/**
* htb_add_class_to_row - add class to its row
+ * @q: the priority event queue
+ * @cl: the class to add
+ * @mask: the given priorities in class in bitmap
*
* The class is added to row at priorities marked in mask.
* It does nothing if mask == 0.
@@ -368,6 +386,9 @@ static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
/**
* htb_remove_class_from_row - removes class from its row
+ * @q: the priority event queue
+ * @cl: the class to remove
+ * @mask: the given priorities in class in bitmap
*
* The class is removed from row at priorities marked in mask.
* It does nothing if mask == 0.
@@ -395,6 +416,8 @@ static inline void htb_remove_class_from_row(struct htb_sched *q,
/**
* htb_activate_prios - creates active class's feed chain
+ * @q: the priority event queue
+ * @cl: the class to activate
*
* The class is connected to ancestors and/or appropriate rows
* for priorities it is participating on. cl->cmode must be new
@@ -408,7 +431,10 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
while (cl->cmode == HTB_MAY_BORROW && p && mask) {
m = mask;
while (m) {
- int prio = ffz(~m);
+ unsigned int prio = ffz(~m);
+
+ if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
+ break;
m &= ~(1 << prio);
if (p->inner.clprio[prio].feed.rb_node)
@@ -430,6 +456,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
/**
* htb_deactivate_prios - remove class from feed chain
+ * @q: the priority event queue
+ * @cl: the class to deactivate
*
* cl->cmode must represent old mode (before deactivation). It does
* nothing if cl->prio_activity == 0. Class is removed from all feed
@@ -490,6 +518,8 @@ static inline s64 htb_hiwater(const struct htb_class *cl)
/**
* htb_class_mode - computes and returns current class mode
+ * @cl: the target class
+ * @diff: diff time in microseconds
*
* It computes cl's mode at time cl->t_c+diff and returns it. If mode
* is not HTB_CAN_SEND then cl->pq_key is updated to time difference
@@ -518,9 +548,12 @@ htb_class_mode(struct htb_class *cl, s64 *diff)
/**
* htb_change_class_mode - changes class's mode
+ * @q: the priority event queue
+ * @cl: the target class
+ * @diff: diff time in microseconds
*
* This should be the only way to change a class's mode under normal
- * cirsumstances. Routine will update feed lists linkage, change mode
+ * circumstances. Routine will update feed lists linkage, change mode
* and add class to the wait event queue if appropriate. New mode should
* be different from old one and cl->pq_key has to be valid if changing
* to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
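Editor's note: a condensed reading of the mode computation the two kernel-doc blocks above describe (simplified; the in-tree htb_class_mode() also folds the optional hysteresis into htb_lowater()/htb_hiwater() and updates *diff so the caller can schedule the next event):

	/* HTB_CANT_SEND:  even the ceil bucket is empty -- class must wait
	 * HTB_MAY_BORROW: own rate bucket is empty, but ceil allows borrowing
	 * HTB_CAN_SEND:   own rate bucket has tokens -- class may send by itself
	 */
	if (cl->ctokens + diff < htb_lowater(cl))
		mode = HTB_CANT_SEND;
	else if (cl->tokens + diff >= htb_hiwater(cl))
		mode = HTB_CAN_SEND;
	else
		mode = HTB_MAY_BORROW;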
@@ -533,8 +566,10 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
if (new_mode == cl->cmode)
return;
- if (new_mode == HTB_CANT_SEND)
+ if (new_mode == HTB_CANT_SEND) {
cl->overlimits++;
+ q->overlimits++;
+ }
if (cl->prio_activity) { /* not necessary: speed optimization */
if (cl->cmode != HTB_CANT_SEND)
@@ -548,6 +583,8 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
/**
* htb_activate - inserts leaf cl into appropriate active feeds
+ * @q: the priority event queue
+ * @cl: the target class
*
* Routine learns (new) priority of leaf and activates feed chain
* for the prio. It can be called on already active leaf safely.
@@ -555,7 +592,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
*/
static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
{
- WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
+ WARN_ON(cl->level || !cl->leaf.q);
if (!cl->prio_activity) {
cl->prio_activity = 1 << cl->prio;
@@ -565,14 +602,16 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
/**
* htb_deactivate - remove leaf cl from active feeds
+ * @q: the priority event queue
+ * @cl: the target class
*
* Make sure that leaf is active. In other words, it can't be called
* with non-active leaf. It also removes class from the drop list.
*/
static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
{
- WARN_ON(!cl->prio_activity);
-
+ if (!cl->prio_activity)
+ return;
htb_deactivate_prios(q, cl);
cl->prio_activity = 0;
}
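Editor's note: with the early return above, htb_deactivate() becomes safe to call on a class that was never activated, which is why later hunks in this patch drop the prio_activity guards at the call sites, e.g.:

	/* before */
	if (cl->prio_activity)
		htb_deactivate(q, cl);

	/* after: the check lives inside htb_deactivate() itself */
	htb_deactivate(q, cl);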
@@ -580,7 +619,7 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- int uninitialized_var(ret);
+ int ret;
unsigned int len = qdisc_pkt_len(skb);
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = htb_classify(skb, sch, &ret);
@@ -644,6 +683,10 @@ static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
/**
* htb_charge_class - charges amount "bytes" to leaf and ancestors
+ * @q: the priority event queue
+ * @cl: the class to start iterate
+ * @level: the minimum level to account
+ * @skb: the socket buffer
*
* Routine assumes that packet "bytes" long was dequeued from leaf cl
* borrowing from "level". It accounts bytes to ceil leaky bucket for
@@ -693,6 +736,9 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
/**
* htb_do_events - make mode changes to classes at the level
+ * @q: the priority event queue
+ * @level: which wait_pq in 'q->hlevel'
+ * @start: start jiffies
*
* Scans event queue for pending events and applies them. Returns time of
* next pending event (0 for no event in pq, q->now for too many events).
@@ -761,6 +807,8 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
/**
* htb_lookup_leaf - returns next leaf class in DRR order
+ * @hprio: the current prio row being searched
+ * @prio: which prio in class
*
* Find leaf where current feed pointers points to.
*/
@@ -773,7 +821,9 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
u32 *pid;
} stk[TC_HTB_MAXDEPTH], *sp = stk;
- BUG_ON(!hprio->row.rb_node);
+ if (unlikely(!hprio->row.rb_node))
+ return NULL;
+
sp->root = hprio->row.rb_node;
sp->pptr = &hprio->ptr;
sp->pid = &hprio->last_ptr_id;
@@ -937,7 +987,6 @@ ok:
goto ok;
}
}
- qdisc_qstats_overlimit(sch);
if (likely(next_event > q->now))
qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
else
@@ -959,7 +1008,7 @@ static void htb_reset(struct Qdisc *sch)
if (cl->level)
memset(&cl->inner, 0, sizeof(cl->inner));
else {
- if (cl->leaf.q)
+ if (cl->leaf.q && !q->offload)
qdisc_reset(cl->leaf.q);
}
cl->prio_activity = 0;
@@ -968,8 +1017,6 @@ static void htb_reset(struct Qdisc *sch)
}
qdisc_watchdog_cancel(&q->watchdog);
__qdisc_reset_queue(&q->direct_queue);
- sch->q.qlen = 0;
- sch->qstats.backlog = 0;
memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
}
@@ -982,6 +1029,7 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
[TCA_HTB_RATE64] = { .type = NLA_U64 },
[TCA_HTB_CEIL64] = { .type = NLA_U64 },
+ [TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
};
static void htb_work_func(struct work_struct *work)
@@ -994,12 +1042,21 @@ static void htb_work_func(struct work_struct *work)
rcu_read_unlock();
}
+static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
+{
+ return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
+}
+
static int htb_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_htb_qopt_offload offload_opt;
struct htb_sched *q = qdisc_priv(sch);
struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_glob *gopt;
+ unsigned int ntx;
+ bool offload;
int err;
qdisc_watchdog_init(&q->watchdog, sch);
@@ -1012,7 +1069,8 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
if (err)
return err;
- err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
+ NULL);
if (err < 0)
return err;
@@ -1023,12 +1081,31 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
if (gopt->version != HTB_VER >> 16)
return -EINVAL;
+ offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
+
+ if (offload) {
+ if (sch->parent != TC_H_ROOT) {
+ NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
+ return -EOPNOTSUPP;
+ }
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
+ return -EOPNOTSUPP;
+ }
+
+ q->num_direct_qdiscs = dev->real_num_tx_queues;
+ q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
+ sizeof(*q->direct_qdiscs),
+ GFP_KERNEL);
+ if (!q->direct_qdiscs)
+ return -ENOMEM;
+ }
+
err = qdisc_class_hash_init(&q->clhash);
if (err < 0)
return err;
- qdisc_skb_head_init(&q->direct_queue);
-
if (tb[TCA_HTB_DIRECT_QLEN])
q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
else
@@ -1038,15 +1115,105 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
q->rate2quantum = 1;
q->defcls = gopt->defcls;
+ if (!offload)
+ return 0;
+
+ for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
+ struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+ struct Qdisc *qdisc;
+
+ qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, 0), extack);
+ if (!qdisc) {
+ return -ENOMEM;
+ }
+
+ q->direct_qdiscs[ntx] = qdisc;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ }
+
+ sch->flags |= TCQ_F_MQROOT;
+
+ offload_opt = (struct tc_htb_qopt_offload) {
+ .command = TC_HTB_CREATE,
+ .parent_classid = TC_H_MAJ(sch->handle) >> 16,
+ .classid = TC_H_MIN(q->defcls),
+ .extack = extack,
+ };
+ err = htb_offload(dev, &offload_opt);
+ if (err)
+ return err;
+
+ /* Defer this assignment, so that htb_destroy skips offload-related
+ * parts (especially calling ndo_setup_tc) on errors.
+ */
+ q->offload = true;
+
return 0;
}
+static void htb_attach_offload(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct htb_sched *q = qdisc_priv(sch);
+ unsigned int ntx;
+
+ for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
+ struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
+
+ old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ qdisc_put(old);
+ qdisc_hash_add(qdisc, false);
+ }
+ for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
+ struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+ struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
+
+ qdisc_put(old);
+ }
+
+ kfree(q->direct_qdiscs);
+ q->direct_qdiscs = NULL;
+}
+
+static void htb_attach_software(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx;
+
+ /* Resemble qdisc_graft behavior. */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+ struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
+
+ qdisc_refcount_inc(sch);
+
+ qdisc_put(old);
+ }
+}
+
+static void htb_attach(struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+
+ if (q->offload)
+ htb_attach_offload(sch);
+ else
+ htb_attach_software(sch);
+}
+
static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct htb_sched *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_htb_glob gopt;
+ if (q->offload)
+ sch->flags |= TCQ_F_OFFLOADED;
+ else
+ sch->flags &= ~TCQ_F_OFFLOADED;
+
+ sch->qstats.overlimits = q->overlimits;
/* It's safe to not acquire the qdisc lock. As we hold RTNL,
* no change can happen on the qdisc parameters.
*/
@@ -1057,12 +1224,14 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
gopt.defcls = q->defcls;
gopt.debug = 0;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
goto nla_put_failure;
+ if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
+ goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -1075,6 +1244,7 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct htb_class *cl = (struct htb_class *)arg;
+ struct htb_sched *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_htb_opt opt;
@@ -1086,7 +1256,7 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
if (!cl->level && cl->leaf.q)
tcm->tcm_info = cl->leaf.q->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -1101,6 +1271,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
opt.level = cl->level;
if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
+ if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
+ goto nla_put_failure;
if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
TCA_HTB_PAD))
@@ -1117,27 +1289,70 @@ nla_put_failure:
return -1;
}
+static void htb_offload_aggregate_stats(struct htb_sched *q,
+ struct htb_class *cl)
+{
+ u64 bytes = 0, packets = 0;
+ struct htb_class *c;
+ unsigned int i;
+
+ gnet_stats_basic_sync_init(&cl->bstats);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
+ struct htb_class *p = c;
+
+ while (p && p->level < cl->level)
+ p = p->parent;
+
+ if (p != cl)
+ continue;
+
+ bytes += u64_stats_read(&c->bstats_bias.bytes);
+ packets += u64_stats_read(&c->bstats_bias.packets);
+ if (c->level == 0) {
+ bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
+ packets += u64_stats_read(&c->leaf.q->bstats.packets);
+ }
+ }
+ }
+ _bstats_update(&cl->bstats, bytes, packets);
+}
+
static int
htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
{
struct htb_class *cl = (struct htb_class *)arg;
+ struct htb_sched *q = qdisc_priv(sch);
struct gnet_stats_queue qs = {
.drops = cl->drops,
.overlimits = cl->overlimits,
};
__u32 qlen = 0;
- if (!cl->level && cl->leaf.q) {
- qlen = cl->leaf.q->q.qlen;
- qs.backlog = cl->leaf.q->qstats.backlog;
- }
+ if (!cl->level && cl->leaf.q)
+ qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
+
cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
INT_MIN, INT_MAX);
cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
INT_MIN, INT_MAX);
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl->bstats) < 0 ||
+ if (q->offload) {
+ if (!cl->level) {
+ if (cl->leaf.q)
+ cl->bstats = cl->leaf.q->bstats;
+ else
+ gnet_stats_basic_sync_init(&cl->bstats);
+ _bstats_update(&cl->bstats,
+ u64_stats_read(&cl->bstats_bias.bytes),
+ u64_stats_read(&cl->bstats_bias.packets));
+ } else {
+ htb_offload_aggregate_stats(q, cl);
+ }
+ }
+
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
@@ -1145,19 +1360,121 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}
+static struct netdev_queue *
+htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_htb_qopt_offload offload_opt;
+ struct htb_sched *q = qdisc_priv(sch);
+ int err;
+
+ if (!q->offload)
+ return sch->dev_queue;
+
+ offload_opt = (struct tc_htb_qopt_offload) {
+ .command = TC_HTB_LEAF_QUERY_QUEUE,
+ .classid = TC_H_MIN(tcm->tcm_parent),
+ };
+ err = htb_offload(dev, &offload_opt);
+ if (err || offload_opt.qid >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, offload_opt.qid);
+}
+
+static struct Qdisc *
+htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
+{
+ struct net_device *dev = dev_queue->dev;
+ struct Qdisc *old_q;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+ old_q = dev_graft_qdisc(dev_queue, new_q);
+ if (new_q)
+ new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return old_q;
+}
+
+static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
+{
+ struct netdev_queue *queue;
+
+ queue = cl->leaf.offload_queue;
+ if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
+ WARN_ON(cl->leaf.q->dev_queue != queue);
+
+ return queue;
+}
+
+static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
+ struct htb_class *cl_new, bool destroying)
+{
+ struct netdev_queue *queue_old, *queue_new;
+ struct net_device *dev = qdisc_dev(sch);
+
+ queue_old = htb_offload_get_queue(cl_old);
+ queue_new = htb_offload_get_queue(cl_new);
+
+ if (!destroying) {
+ struct Qdisc *qdisc;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+ qdisc = dev_graft_qdisc(queue_old, NULL);
+ WARN_ON(qdisc != cl_old->leaf.q);
+ }
+
+ if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
+ cl_old->leaf.q->dev_queue = queue_new;
+ cl_old->leaf.offload_queue = queue_new;
+
+ if (!destroying) {
+ struct Qdisc *qdisc;
+
+ qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+ WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
+ }
+}
+
static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old, struct netlink_ext_ack *extack)
{
+ struct netdev_queue *dev_queue = sch->dev_queue;
struct htb_class *cl = (struct htb_class *)arg;
+ struct htb_sched *q = qdisc_priv(sch);
+ struct Qdisc *old_q;
if (cl->level)
return -EINVAL;
- if (new == NULL &&
- (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- cl->common.classid, extack)) == NULL)
- return -ENOBUFS;
+
+ if (q->offload)
+ dev_queue = htb_offload_get_queue(cl);
+
+ if (!new) {
+ new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
+ cl->common.classid, extack);
+ if (!new)
+ return -ENOBUFS;
+ }
+
+ if (q->offload) {
+ /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
+ qdisc_refcount_inc(new);
+ old_q = htb_graft_helper(dev_queue, new);
+ }
*old = qdisc_replace(sch, new, &cl->leaf.q);
+
+ if (q->offload) {
+ WARN_ON(old_q != *old);
+ qdisc_put(old_q);
+ }
+
return 0;
}
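Editor's note: the new .select_queue hook lets the core ask the offloaded HTB root which TX queue a child qdisc should live on before creating or grafting it. Roughly how the core consumes it (condensed from sch_api.c; the exact placement may differ):

	/* inside qdisc_create(): ask the classful parent for the netdev_queue */
	if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
		dev_queue = p->ops->cl_ops->select_queue(p, tcm);
	else if (p)
		dev_queue = p->dev_queue;
	else
		dev_queue = netdev_get_tx_queue(dev, 0);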
@@ -1185,9 +1502,10 @@ static inline int htb_parent_last_child(struct htb_class *cl)
return 1;
}
-static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
+static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
struct Qdisc *new_q)
{
+ struct htb_sched *q = qdisc_priv(sch);
struct htb_class *parent = cl->parent;
WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
@@ -1203,6 +1521,84 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
parent->ctokens = parent->cbuffer;
parent->t_c = ktime_get_ns();
parent->cmode = HTB_CAN_SEND;
+ if (q->offload)
+ parent->leaf.offload_queue = cl->leaf.offload_queue;
+}
+
+static void htb_parent_to_leaf_offload(struct Qdisc *sch,
+ struct netdev_queue *dev_queue,
+ struct Qdisc *new_q)
+{
+ struct Qdisc *old_q;
+
+ /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
+ if (new_q)
+ qdisc_refcount_inc(new_q);
+ old_q = htb_graft_helper(dev_queue, new_q);
+ WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
+}
+
+static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
+ bool last_child, bool destroying,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_htb_qopt_offload offload_opt;
+ struct netdev_queue *dev_queue;
+ struct Qdisc *q = cl->leaf.q;
+ struct Qdisc *old;
+ int err;
+
+ if (cl->level)
+ return -EINVAL;
+
+ WARN_ON(!q);
+ dev_queue = htb_offload_get_queue(cl);
+ /* When destroying, caller qdisc_graft grafts the new qdisc and invokes
+ * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
+ * does not need to graft or qdisc_put the qdisc being destroyed.
+ */
+ if (!destroying) {
+ old = htb_graft_helper(dev_queue, NULL);
+ /* Last qdisc grafted should be the same as cl->leaf.q when
+ * calling htb_delete.
+ */
+ WARN_ON(old != q);
+ }
+
+ if (cl->parent) {
+ _bstats_update(&cl->parent->bstats_bias,
+ u64_stats_read(&q->bstats.bytes),
+ u64_stats_read(&q->bstats.packets));
+ }
+
+ offload_opt = (struct tc_htb_qopt_offload) {
+ .command = !last_child ? TC_HTB_LEAF_DEL :
+ destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
+ TC_HTB_LEAF_DEL_LAST,
+ .classid = cl->common.classid,
+ .extack = extack,
+ };
+ err = htb_offload(qdisc_dev(sch), &offload_opt);
+
+ if (!destroying) {
+ if (!err)
+ qdisc_put(old);
+ else
+ htb_graft_helper(dev_queue, old);
+ }
+
+ if (last_child)
+ return err;
+
+ if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
+ u32 classid = TC_H_MAJ(sch->handle) |
+ TC_H_MIN(offload_opt.classid);
+ struct htb_class *moved_cl = htb_find(classid, sch);
+
+ htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
+ }
+
+ return err;
}
static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
@@ -1218,8 +1614,11 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
static void htb_destroy(struct Qdisc *sch)
{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_htb_qopt_offload offload_opt;
struct htb_sched *q = qdisc_priv(sch);
struct hlist_node *next;
+ bool nonempty, changed;
struct htb_class *cl;
unsigned int i;
@@ -1238,60 +1637,118 @@ static void htb_destroy(struct Qdisc *sch)
cl->block = NULL;
}
}
- for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
- common.hnode)
- htb_destroy_class(sch, cl);
- }
+
+ do {
+ nonempty = false;
+ changed = false;
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode) {
+ bool last_child;
+
+ if (!q->offload) {
+ htb_destroy_class(sch, cl);
+ continue;
+ }
+
+ nonempty = true;
+
+ if (cl->level)
+ continue;
+
+ changed = true;
+
+ last_child = htb_parent_last_child(cl);
+ htb_destroy_class_offload(sch, cl, last_child,
+ true, NULL);
+ qdisc_class_hash_remove(&q->clhash,
+ &cl->common);
+ if (cl->parent)
+ cl->parent->children--;
+ if (last_child)
+ htb_parent_to_leaf(sch, cl, NULL);
+ htb_destroy_class(sch, cl);
+ }
+ }
+ } while (changed);
+ WARN_ON(nonempty);
+
qdisc_class_hash_destroy(&q->clhash);
__qdisc_reset_queue(&q->direct_queue);
+
+ if (q->offload) {
+ offload_opt = (struct tc_htb_qopt_offload) {
+ .command = TC_HTB_DESTROY,
+ };
+ htb_offload(dev, &offload_opt);
+ }
+
+ if (!q->direct_qdiscs)
+ return;
+ for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
+ qdisc_put(q->direct_qdiscs[i]);
+ kfree(q->direct_qdiscs);
}
-static int htb_delete(struct Qdisc *sch, unsigned long arg)
+static int htb_delete(struct Qdisc *sch, unsigned long arg,
+ struct netlink_ext_ack *extack)
{
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = (struct htb_class *)arg;
struct Qdisc *new_q = NULL;
int last_child = 0;
+ int err;
/* TODO: why don't we allow deleting a subtree? references? does the
* tc subsystem guarantee that in htb_destroy it holds no class refs,
* so that we can remove children safely there?
*/
- if (cl->children || cl->filter_cnt)
+ if (cl->children || qdisc_class_in_use(&cl->common)) {
+ NL_SET_ERR_MSG(extack, "HTB class in use");
return -EBUSY;
+ }
- if (!cl->level && htb_parent_last_child(cl)) {
- new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
- cl->parent->common.classid,
- NULL);
+ if (!cl->level && htb_parent_last_child(cl))
last_child = 1;
+
+ if (q->offload) {
+ err = htb_destroy_class_offload(sch, cl, last_child, false,
+ extack);
+ if (err)
+ return err;
}
- sch_tree_lock(sch);
+ if (last_child) {
+ struct netdev_queue *dev_queue = sch->dev_queue;
- if (!cl->level) {
- unsigned int qlen = cl->leaf.q->q.qlen;
- unsigned int backlog = cl->leaf.q->qstats.backlog;
+ if (q->offload)
+ dev_queue = htb_offload_get_queue(cl);
- qdisc_reset(cl->leaf.q);
- qdisc_tree_reduce_backlog(cl->leaf.q, qlen, backlog);
+ new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
+ cl->parent->common.classid,
+ NULL);
+ if (q->offload)
+ htb_parent_to_leaf_offload(sch, dev_queue, new_q);
}
+ sch_tree_lock(sch);
+
+ if (!cl->level)
+ qdisc_purge_queue(cl->leaf.q);
+
/* delete from hash and active; remainder in destroy_class */
qdisc_class_hash_remove(&q->clhash, &cl->common);
if (cl->parent)
cl->parent->children--;
- if (cl->prio_activity)
- htb_deactivate(q, cl);
+ htb_deactivate(q, cl);
if (cl->cmode != HTB_CAN_SEND)
htb_safe_rb_erase(&cl->pq_node,
&q->hlevel[cl->level].wait_pq);
if (last_child)
- htb_parent_to_leaf(q, cl, new_q);
+ htb_parent_to_leaf(sch, cl, new_q);
sch_tree_unlock(sch);
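Editor's note: the delete and change paths above follow one ordering rule: everything that may sleep (allocating the replacement pfifo with GFP_KERNEL, the ndo_setup_tc round trips issued by htb_offload()) happens before sch_tree_lock(), and only the pointer surgery on the class tree happens under the lock. Schematically (a hedged summary, not literal patch content):

	/* may sleep: allocation and driver callbacks, done under RTNL only */
	err = htb_destroy_class_offload(sch, cl, last_child, false, extack);
	new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, classid, NULL);

	sch_tree_lock(sch);
	/* atomic section: purge queues, unlink the class, fix parent bookkeeping */
	qdisc_purge_queue(cl->leaf.q);
	qdisc_class_hash_remove(&q->clhash, &cl->common);
	htb_deactivate(q, cl);
	sch_tree_unlock(sch);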
@@ -1306,8 +1763,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
int err = -EINVAL;
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = (struct htb_class *)*arg, *parent;
+ struct tc_htb_qopt_offload offload_opt;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct Qdisc *parent_qdisc = NULL;
+ struct netdev_queue *dev_queue;
struct tc_htb_opt *hopt;
u64 rate64, ceil64;
int warn = 0;
@@ -1316,7 +1776,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (!opt)
goto failure;
- err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
+ extack);
if (err < 0)
goto failure;
@@ -1330,6 +1791,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (!hopt->rate.rate || !hopt->ceil.rate)
goto failure;
+ if (q->offload) {
+ /* Options not supported by the offload. */
+ if (hopt->rate.overhead || hopt->ceil.overhead) {
+ NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
+ goto failure;
+ }
+ if (hopt->rate.mpu || hopt->ceil.mpu) {
+ NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
+ goto failure;
+ }
+ }
+
/* Keeping backward compatible with rate_table based iproute2 tc */
if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
@@ -1339,8 +1812,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
NULL));
+ rate64 = nla_get_u64_default(tb[TCA_HTB_RATE64], 0);
+ ceil64 = nla_get_u64_default(tb[TCA_HTB_CEIL64], 0);
+
if (!cl) { /* new class */
- struct Qdisc *new_q;
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *new_q, *old_q;
int prio;
struct {
struct nlattr nla;
@@ -1364,7 +1841,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* check maximal depth */
if (parent && parent->parent && parent->parent->level < 2) {
- pr_err("htb: tree is too deep\n");
+ NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
goto failure;
}
err = -ENOBUFS;
@@ -1372,6 +1849,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (!cl)
goto failure;
+ gnet_stats_basic_sync_init(&cl->bstats);
+ gnet_stats_basic_sync_init(&cl->bstats_bias);
+
err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
if (err) {
kfree(cl);
@@ -1381,13 +1861,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
err = gen_new_estimator(&cl->bstats, NULL,
&cl->rate_est,
NULL,
- qdisc_root_sleeping_running(sch),
+ true,
tca[TCA_RATE] ? : &est.nla);
- if (err) {
- tcf_block_put(cl->block);
- kfree(cl);
- goto failure;
- }
+ if (err)
+ goto err_block_put;
}
cl->children = 0;
@@ -1396,23 +1873,83 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
RB_CLEAR_NODE(&cl->node[prio]);
+ cl->common.classid = classid;
+
+ /* Make sure nothing interrupts us between the two
+ * ndo_setup_tc calls.
+ */
+ ASSERT_RTNL();
+
/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
* so that can't be used inside of sch_tree_lock
* -- thanks to Karlis Peisenieks
*/
- new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ if (!q->offload) {
+ dev_queue = sch->dev_queue;
+ } else if (!(parent && !parent->level)) {
+ /* Assign a dev_queue to this classid. */
+ offload_opt = (struct tc_htb_qopt_offload) {
+ .command = TC_HTB_LEAF_ALLOC_QUEUE,
+ .classid = cl->common.classid,
+ .parent_classid = parent ?
+ TC_H_MIN(parent->common.classid) :
+ TC_HTB_CLASSID_ROOT,
+ .rate = max_t(u64, hopt->rate.rate, rate64),
+ .ceil = max_t(u64, hopt->ceil.rate, ceil64),
+ .prio = hopt->prio,
+ .quantum = hopt->quantum,
+ .extack = extack,
+ };
+ err = htb_offload(dev, &offload_opt);
+ if (err) {
+ NL_SET_ERR_MSG_WEAK(extack,
+ "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
+ goto err_kill_estimator;
+ }
+ dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
+ } else { /* First child. */
+ dev_queue = htb_offload_get_queue(parent);
+ old_q = htb_graft_helper(dev_queue, NULL);
+ WARN_ON(old_q != parent->leaf.q);
+ offload_opt = (struct tc_htb_qopt_offload) {
+ .command = TC_HTB_LEAF_TO_INNER,
+ .classid = cl->common.classid,
+ .parent_classid =
+ TC_H_MIN(parent->common.classid),
+ .rate = max_t(u64, hopt->rate.rate, rate64),
+ .ceil = max_t(u64, hopt->ceil.rate, ceil64),
+ .prio = hopt->prio,
+ .quantum = hopt->quantum,
+ .extack = extack,
+ };
+ err = htb_offload(dev, &offload_opt);
+ if (err) {
+ NL_SET_ERR_MSG_WEAK(extack,
+ "Failed to offload TC_HTB_LEAF_TO_INNER");
+ htb_graft_helper(dev_queue, old_q);
+ goto err_kill_estimator;
+ }
+ _bstats_update(&parent->bstats_bias,
+ u64_stats_read(&old_q->bstats.bytes),
+ u64_stats_read(&old_q->bstats.packets));
+ qdisc_put(old_q);
+ }
+ new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
classid, NULL);
+ if (q->offload) {
+ /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
+ if (new_q)
+ qdisc_refcount_inc(new_q);
+ old_q = htb_graft_helper(dev_queue, new_q);
+ /* No qdisc_put needed. */
+ WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
+ }
sch_tree_lock(sch);
if (parent && !parent->level) {
- unsigned int qlen = parent->leaf.q->q.qlen;
- unsigned int backlog = parent->leaf.q->qstats.backlog;
-
/* turn parent into inner node */
- qdisc_reset(parent->leaf.q);
- qdisc_tree_reduce_backlog(parent->leaf.q, qlen, backlog);
- qdisc_put(parent->leaf.q);
- if (parent->prio_activity)
- htb_deactivate(q, parent);
+ qdisc_purge_queue(parent->leaf.q);
+ parent_qdisc = parent->leaf.q;
+ htb_deactivate(q, parent);
/* remove from evt list because of level change */
if (parent->cmode != HTB_CAN_SEND) {
@@ -1423,10 +1960,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
: TC_HTB_MAXDEPTH) - 1;
memset(&parent->inner, 0, sizeof(parent->inner));
}
+
/* leaf (we) needs elementary qdisc */
cl->leaf.q = new_q ? new_q : &noop_qdisc;
+ if (q->offload)
+ cl->leaf.offload_queue = dev_queue;
- cl->common.classid = classid;
cl->parent = parent;
/* set class to be in HTB_CAN_SEND state */
@@ -1447,17 +1986,37 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
NULL,
- qdisc_root_sleeping_running(sch),
+ true,
tca[TCA_RATE]);
if (err)
return err;
}
- sch_tree_lock(sch);
- }
- rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+ if (q->offload) {
+ struct net_device *dev = qdisc_dev(sch);
+
+ offload_opt = (struct tc_htb_qopt_offload) {
+ .command = TC_HTB_NODE_MODIFY,
+ .classid = cl->common.classid,
+ .rate = max_t(u64, hopt->rate.rate, rate64),
+ .ceil = max_t(u64, hopt->ceil.rate, ceil64),
+ .prio = hopt->prio,
+ .quantum = hopt->quantum,
+ .extack = extack,
+ };
+ err = htb_offload(dev, &offload_opt);
+ if (err)
+ /* Estimator was replaced, and rollback may fail
+ * as well, so we don't try to recover it, and
+ * the estimator won't work properly with the
+ * offload anyway, because bstats are updated
+ * only when the stats are queried.
+ */
+ return err;
+ }
- ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+ sch_tree_lock(sch);
+ }
psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
@@ -1489,16 +2048,23 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
sch_tree_unlock(sch);
+ qdisc_put(parent_qdisc);
if (warn)
- pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
- cl->common.classid, (warn == -1 ? "small" : "big"));
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "quantum of class %X is %s. Consider r2q change.",
+ cl->common.classid, (warn == -1 ? "small" : "big"));
qdisc_class_hash_grow(sch, &q->clhash);
*arg = (unsigned long)cl;
return 0;
+err_kill_estimator:
+ gen_kill_estimator(&cl->rate_est);
+err_block_put:
+ tcf_block_put(cl->block);
+ kfree(cl);
failure:
return err;
}
@@ -1527,7 +2093,7 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
* be broken by class during destroy IIUC.
*/
if (cl)
- cl->filter_cnt++;
+ qdisc_class_get(&cl->common);
return (unsigned long)cl;
}
@@ -1535,8 +2101,7 @@ static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
struct htb_class *cl = (struct htb_class *)arg;
- if (cl)
- cl->filter_cnt--;
+ qdisc_class_put(&cl->common);
}
static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
@@ -1550,20 +2115,14 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
return;
- }
- arg->count++;
}
}
}
static const struct Qdisc_class_ops htb_class_ops = {
+ .select_queue = htb_select_queue,
.graft = htb_graft,
.leaf = htb_leaf,
.qlen_notify = htb_qlen_notify,
@@ -1586,11 +2145,13 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
.dequeue = htb_dequeue,
.peek = qdisc_peek_dequeued,
.init = htb_init,
+ .attach = htb_attach,
.reset = htb_reset,
.destroy = htb_destroy,
.dump = htb_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("htb");
static int __init htb_module_init(void)
{
@@ -1604,3 +2165,4 @@ static void __exit htb_module_exit(void)
module_init(htb_module_init)
module_exit(htb_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler");
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index ce3f55259d0d..cc6051d4f2ef 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* net/sched/sch_ingress.c - Ingress and clsact qdisc
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim 1999
*/
@@ -17,6 +13,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include <net/tcx.h>
struct ingress_sched_data {
struct tcf_block *block;
@@ -82,23 +79,55 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
{
struct ingress_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *entry;
+ bool created;
+ int err;
+
+ if (sch->parent != TC_H_INGRESS)
+ return -EOPNOTSUPP;
net_inc_ingress_queue();
- mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
+ entry = tcx_entry_fetch_or_create(dev, true, &created);
+ if (!entry)
+ return -ENOMEM;
+ tcx_miniq_inc(entry);
+ mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, true);
- q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->block_info.chain_head_change = clsact_chain_head_change;
q->block_info.chain_head_change_priv = &q->miniqp;
- return tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
+ err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
+ if (err)
+ return err;
+
+ mini_qdisc_pair_block_init(&q->miniqp, q->block);
+
+ return 0;
}
static void ingress_destroy(struct Qdisc *sch)
{
struct ingress_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress);
+
+ if (sch->parent != TC_H_INGRESS)
+ return;
tcf_block_put_ext(q->block, sch, &q->block_info);
+
+ if (entry) {
+ tcx_miniq_dec(entry);
+ if (!tcx_entry_is_active(entry)) {
+ tcx_entry_update(dev, NULL, true);
+ tcx_entry_free(entry);
+ }
+ }
+
net_dec_ingress_queue();
}
@@ -106,7 +135,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -118,6 +147,7 @@ nla_put_failure:
}
static const struct Qdisc_class_ops ingress_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = ingress_find,
.walk = ingress_walk,
@@ -130,7 +160,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
.cl_ops = &ingress_class_ops,
.id = "ingress",
.priv_size = sizeof(struct ingress_sched_data),
- .static_flags = TCQ_F_CPUSTATS,
+ .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS,
.init = ingress_init,
.destroy = ingress_destroy,
.dump = ingress_dump,
@@ -138,6 +168,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
.ingress_block_get = ingress_ingress_block_get,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("ingress");
struct clsact_sched_data {
struct tcf_block *ingress_block;
@@ -213,14 +244,25 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
{
struct clsact_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *entry;
+ bool created;
int err;
+ if (sch->parent != TC_H_CLSACT)
+ return -EOPNOTSUPP;
+
net_inc_ingress_queue();
net_inc_egress_queue();
- mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
+ entry = tcx_entry_fetch_or_create(dev, true, &created);
+ if (!entry)
+ return -ENOMEM;
+ tcx_miniq_inc(entry);
+ mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, true);
- q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->ingress_block_info.chain_head_change = clsact_chain_head_change;
q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
@@ -229,9 +271,17 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
if (err)
return err;
- mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
+ mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block);
+
+ entry = tcx_entry_fetch_or_create(dev, false, &created);
+ if (!entry)
+ return -ENOMEM;
+ tcx_miniq_inc(entry);
+ mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, false);
- q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
+ q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
q->egress_block_info.chain_head_change = clsact_chain_head_change;
q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
@@ -241,15 +291,38 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
static void clsact_destroy(struct Qdisc *sch)
{
struct clsact_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress);
+ struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress);
+
+ if (sch->parent != TC_H_CLSACT)
+ return;
- tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
+ tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
+
+ if (ingress_entry) {
+ tcx_miniq_dec(ingress_entry);
+ if (!tcx_entry_is_active(ingress_entry)) {
+ tcx_entry_update(dev, NULL, true);
+ tcx_entry_free(ingress_entry);
+ }
+ }
+
+ if (egress_entry) {
+ tcx_miniq_dec(egress_entry);
+ if (!tcx_entry_is_active(egress_entry)) {
+ tcx_entry_update(dev, NULL, false);
+ tcx_entry_free(egress_entry);
+ }
+ }
net_dec_ingress_queue();
net_dec_egress_queue();
}
static const struct Qdisc_class_ops clsact_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = clsact_find,
.walk = ingress_walk,
@@ -262,7 +335,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = {
.cl_ops = &clsact_class_ops,
.id = "clsact",
.priv_size = sizeof(struct clsact_sched_data),
- .static_flags = TCQ_F_CPUSTATS,
+ .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS,
.init = clsact_init,
.destroy = clsact_destroy,
.dump = ingress_dump,
@@ -272,6 +345,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = {
.egress_block_get = clsact_egress_block_get,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("clsact");
static int __init ingress_module_init(void)
{
@@ -296,5 +370,5 @@ static void __exit ingress_module_exit(void)
module_init(ingress_module_init);
module_exit(ingress_module_exit);
-MODULE_ALIAS("sch_clsact");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Ingress and clsact based ingress and egress qdiscs");
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 203659bc3906..c860119a8f09 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_mq.c Classful multiqueue dummy scheduler
*
* Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -133,10 +130,9 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
struct net_device *dev = qdisc_dev(sch);
struct Qdisc *qdisc;
unsigned int ntx;
- __u32 qlen = 0;
sch->q.qlen = 0;
- memset(&sch->bstats, 0, sizeof(sch->bstats));
+ gnet_stats_basic_sync_init(&sch->bstats);
memset(&sch->qstats, 0, sizeof(sch->qstats));
/* MQ supports lockless qdiscs. However, statistics accounting needs
@@ -145,27 +141,14 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
* qdisc totals are added at end.
*/
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
- qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
spin_lock_bh(qdisc_lock(qdisc));
- if (qdisc_is_percpu_stats(qdisc)) {
- qlen = qdisc_qlen_sum(qdisc);
- __gnet_stats_copy_basic(NULL, &sch->bstats,
- qdisc->cpu_bstats,
- &qdisc->bstats);
- __gnet_stats_copy_queue(&sch->qstats,
- qdisc->cpu_qstats,
- &qdisc->qstats, qlen);
- } else {
- sch->q.qlen += qdisc->q.qlen;
- sch->bstats.bytes += qdisc->bstats.bytes;
- sch->bstats.packets += qdisc->bstats.packets;
- sch->qstats.qlen += qdisc->qstats.qlen;
- sch->qstats.backlog += qdisc->qstats.backlog;
- sch->qstats.drops += qdisc->qstats.drops;
- sch->qstats.requeues += qdisc->qstats.requeues;
- sch->qstats.overlimits += qdisc->qstats.overlimits;
- }
+ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats,
+ &qdisc->bstats, false);
+ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats,
+ &qdisc->qstats);
+ sch->q.qlen += qdisc_qlen(qdisc);
spin_unlock_bh(qdisc_lock(qdisc));
}
@@ -219,7 +202,7 @@ static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
- return dev_queue->qdisc_sleeping;
+ return rtnl_dereference(dev_queue->qdisc_sleeping);
}
static unsigned long mq_find(struct Qdisc *sch, u32 classid)
@@ -238,7 +221,7 @@ static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle |= TC_H_MIN(cl);
- tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle;
return 0;
}
@@ -247,9 +230,9 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
- sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+ sch = rtnl_dereference(dev_queue->qdisc_sleeping);
+ if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 ||
+ qdisc_qstats_copy(d, sch) < 0)
return -1;
return 0;
}
@@ -264,11 +247,8 @@ static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count = arg->skip;
for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
- if (arg->fn(sch, ntx + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, ntx + 1, arg))
break;
- }
- arg->count++;
}
}
@@ -289,6 +269,7 @@ struct Qdisc_ops mq_qdisc_ops __read_mostly = {
.init = mq_init,
.destroy = mq_destroy,
.attach = mq_attach,
+ .change_real_num_tx = mq_change_real_num_tx,
.dump = mq_dump,
.owner = THIS_MODULE,
};
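Editor's note: the rewritten mq_dump() relies on the gnet_stats_add_*() helpers to fold per-child (and, for lockless qdiscs, per-CPU) counters into the root's totals. The aggregation loop in isolation, as a sketch of the pattern a classful root follows:

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		struct Qdisc *qdisc =
			rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);

		spin_lock_bh(qdisc_lock(qdisc));
		/* helpers handle both per-CPU (cpu_bstats/cpu_qstats) and plain counters */
		gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, &qdisc->bstats, false);
		gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, &qdisc->qstats);
		sch->q.qlen += qdisc_qlen(qdisc);
		spin_unlock_bh(qdisc_lock(qdisc));
	}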
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index d364e63c396d..f3e5ef9a9592 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -1,13 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_mqprio.c
*
* Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
+#include <linux/ethtool_netlink.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -20,6 +18,8 @@
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
+#include "sch_mqprio_lib.h"
+
struct mqprio_sched {
struct Qdisc **qdiscs;
u16 mode;
@@ -28,8 +28,70 @@ struct mqprio_sched {
u32 flags;
u64 min_rate[TC_QOPT_MAX_QUEUE];
u64 max_rate[TC_QOPT_MAX_QUEUE];
+ u32 fp[TC_QOPT_MAX_QUEUE];
};
+static int mqprio_enable_offload(struct Qdisc *sch,
+ const struct tc_mqprio_qopt *qopt,
+ struct netlink_ext_ack *extack)
+{
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mqprio_qopt_offload mqprio = {
+ .qopt = *qopt,
+ .extack = extack,
+ };
+ int err, i;
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
+ return -EINVAL;
+ break;
+ case TC_MQPRIO_MODE_CHANNEL:
+ mqprio.flags = priv->flags;
+ if (priv->flags & TC_MQPRIO_F_MODE)
+ mqprio.mode = priv->mode;
+ if (priv->flags & TC_MQPRIO_F_SHAPER)
+ mqprio.shaper = priv->shaper;
+ if (priv->flags & TC_MQPRIO_F_MIN_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.min_rate[i] = priv->min_rate[i];
+ if (priv->flags & TC_MQPRIO_F_MAX_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.max_rate[i] = priv->max_rate[i];
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ mqprio_fp_to_offload(priv->fp, &mqprio);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ if (err)
+ return err;
+
+ priv->hw_offload = mqprio.qopt.hw;
+
+ return 0;
+}
+
+static void mqprio_disable_offload(struct Qdisc *sch)
+{
+ struct tc_mqprio_qopt_offload mqprio = { { 0 } };
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ case TC_MQPRIO_MODE_CHANNEL:
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ break;
+ }
+}
+
static void mqprio_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
@@ -44,37 +106,17 @@ static void mqprio_destroy(struct Qdisc *sch)
kfree(priv->qdiscs);
}
- if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
- struct tc_mqprio_qopt_offload mqprio = { { 0 } };
-
- switch (priv->mode) {
- case TC_MQPRIO_MODE_DCB:
- case TC_MQPRIO_MODE_CHANNEL:
- dev->netdev_ops->ndo_setup_tc(dev,
- TC_SETUP_QDISC_MQPRIO,
- &mqprio);
- break;
- default:
- return;
- }
- } else {
+ if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc)
+ mqprio_disable_offload(sch);
+ else
netdev_set_num_tc(dev, 0);
- }
}
-static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ const struct tc_mqprio_caps *caps,
+ struct netlink_ext_ack *extack)
{
- int i, j;
-
- /* Verify num_tc is not out of max range */
- if (qopt->num_tc > TC_MAX_QUEUE)
- return -EINVAL;
-
- /* Verify priority mapping uses valid tcs */
- for (i = 0; i < TC_BITMASK + 1; i++) {
- if (qopt->prio_tc_map[i] >= qopt->num_tc)
- return -EINVAL;
- }
+ int err;
/* Limit qopt->hw to maximum supported offload value. Drivers have
* the option of overriding this later if they don't support a
@@ -83,52 +125,220 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX)
qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX;
- /* If hardware offload is requested we will leave it to the device
- * to either populate the queue counts itself or to validate the
- * provided queue counts. If ndo_setup_tc is not present then
- * hardware doesn't support offload and we should return an error.
+ /* If hardware offload is requested, we will leave 3 options to the
+ * device driver:
+ * - populate the queue counts itself (and ignore what was requested)
+ * - validate the provided queue counts by itself (and apply them)
+ * - request queue count validation here (and apply them)
*/
- if (qopt->hw)
- return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL;
-
- for (i = 0; i < qopt->num_tc; i++) {
- unsigned int last = qopt->offset[i] + qopt->count[i];
-
- /* Verify the queue count is in tx range being equal to the
- * real_num_tx_queues indicates the last queue is in use.
- */
- if (qopt->offset[i] >= dev->real_num_tx_queues ||
- !qopt->count[i] ||
- last > dev->real_num_tx_queues)
- return -EINVAL;
-
- /* Verify that the offset and counts do not overlap */
- for (j = i + 1; j < qopt->num_tc; j++) {
- if (last > qopt->offset[j])
- return -EINVAL;
- }
+ err = mqprio_validate_qopt(dev, qopt,
+ !qopt->hw || caps->validate_queue_counts,
+ false, extack);
+ if (err)
+ return err;
+
+ /* If ndo_setup_tc is not present then hardware doesn't support offload
+ * and we should return an error.
+ */
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not support hardware offload");
+ return -EINVAL;
}
return 0;
}
+static const struct
+nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = {
+ [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32,
+ TC_QOPT_MAX_QUEUE - 1),
+ [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32,
+ TC_FP_EXPRESS,
+ TC_FP_PREEMPTIBLE),
+};
+
static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
[TCA_MQPRIO_MODE] = { .len = sizeof(u16) },
[TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) },
[TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED },
[TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED },
+ [TCA_MQPRIO_TC_ENTRY] = { .type = NLA_NESTED },
};
-static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
- const struct nla_policy *policy, int len)
+static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct nlattr *opt,
+ unsigned long *seen_tcs,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1];
+ int err, tc;
+
+ err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt,
+ mqprio_tc_entry_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) {
+ NL_SET_ERR_MSG(extack, "TC entry index missing");
+ return -EINVAL;
+ }
+
+ tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]);
+ if (*seen_tcs & BIT(tc)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX],
+ "Duplicate tc entry");
+ return -EINVAL;
+ }
+
+ *seen_tcs |= BIT(tc);
+
+ if (tb[TCA_MQPRIO_TC_ENTRY_FP])
+ fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]);
+
+ return 0;
+}
+
+static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
+ int nlattr_opt_len,
+ struct netlink_ext_ack *extack)
{
- int nested_len = nla_len(nla) - NLA_ALIGN(len);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool have_preemption = false;
+ unsigned long seen_tcs = 0;
+ u32 fp[TC_QOPT_MAX_QUEUE];
+ struct nlattr *n;
+ int tc, rem;
+ int err = 0;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ fp[tc] = priv->fp[tc];
+
+ nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt,
+ nlattr_opt_len, rem) {
+ err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
+ if (err)
+ goto out;
+ }
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ priv->fp[tc] = fp[tc];
+ if (fp[tc] == TC_FP_PREEMPTIBLE)
+ have_preemption = true;
+ }
+
+ if (have_preemption && !ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG(extack, "Device does not support preemption");
+ return -EOPNOTSUPP;
+ }
+out:
+ return err;
+}
+
+/* Parse the other netlink attributes that represent the payload of
+ * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt.
+ */
+static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
+ struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *nlattr_opt = nla_data(opt) + NLA_ALIGN(sizeof(*qopt));
+ int nlattr_opt_len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct nlattr *tb[TCA_MQPRIO_MAX + 1] = {};
+ struct nlattr *attr;
+ int i, rem, err;
+
+ if (nlattr_opt_len >= nla_attr_size(0)) {
+ err = nla_parse_deprecated(tb, TCA_MQPRIO_MAX, nlattr_opt,
+ nlattr_opt_len, mqprio_policy,
+ NULL);
+ if (err < 0)
+ return err;
+ }
+
+ if (!qopt->hw) {
+ NL_SET_ERR_MSG(extack,
+ "mqprio TCA_OPTIONS can only contain netlink attributes in hardware mode");
+ return -EINVAL;
+ }
- if (nested_len >= nla_attr_size(0))
- return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
- nested_len, policy, NULL);
+ if (tb[TCA_MQPRIO_MODE]) {
+ priv->flags |= TC_MQPRIO_F_MODE;
+ priv->mode = nla_get_u16(tb[TCA_MQPRIO_MODE]);
+ }
+
+ if (tb[TCA_MQPRIO_SHAPER]) {
+ priv->flags |= TC_MQPRIO_F_SHAPER;
+ priv->shaper = nla_get_u16(tb[TCA_MQPRIO_SHAPER]);
+ }
+
+ if (tb[TCA_MQPRIO_MIN_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MIN_RATE64],
+ "min_rate accepted only when shaper is in bw_rlimit mode");
+ return -EINVAL;
+ }
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Attribute type expected to be TCA_MQPRIO_MIN_RATE64");
+ return -EINVAL;
+ }
+
+ if (nla_len(attr) != sizeof(u64)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Attribute TCA_MQPRIO_MIN_RATE64 expected to have 8 bytes length");
+ return -EINVAL;
+ }
+
+ if (i >= qopt->num_tc)
+ break;
+ priv->min_rate[i] = nla_get_u64(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MIN_RATE;
+ }
+
+ if (tb[TCA_MQPRIO_MAX_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_MAX_RATE64],
+ "max_rate accepted only when shaper is in bw_rlimit mode");
+ return -EINVAL;
+ }
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Attribute type expected to be TCA_MQPRIO_MAX_RATE64");
+ return -EINVAL;
+ }
+
+ if (nla_len(attr) != sizeof(u64)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Attribute TCA_MQPRIO_MAX_RATE64 expected to have 8 bytes length");
+ return -EINVAL;
+ }
+
+ if (i >= qopt->num_tc)
+ break;
+ priv->max_rate[i] = nla_get_u64(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MAX_RATE;
+ }
+
+ if (tb[TCA_MQPRIO_TC_ENTRY]) {
+ err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len,
+ extack);
+ if (err)
+ return err;
+ }
- memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
@@ -141,10 +351,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
struct Qdisc *qdisc;
int i, err = -EOPNOTSUPP;
struct tc_mqprio_qopt *qopt = NULL;
- struct nlattr *tb[TCA_MQPRIO_MAX + 1];
- struct nlattr *attr;
- int rem;
- int len;
+ struct tc_mqprio_caps caps;
+ int len, tc;
BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
@@ -162,61 +370,21 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt || nla_len(opt) < sizeof(*qopt))
return -EINVAL;
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ priv->fp[tc] = TC_FP_EXPRESS;
+
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
+ &caps, sizeof(caps));
+
qopt = nla_data(opt);
- if (mqprio_parse_opt(dev, qopt))
+ if (mqprio_parse_opt(dev, qopt, &caps, extack))
return -EINVAL;
len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
if (len > 0) {
- err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
- sizeof(*qopt));
- if (err < 0)
+ err = mqprio_parse_nlattr(sch, qopt, opt, extack);
+ if (err)
return err;
-
- if (!qopt->hw)
- return -EINVAL;
-
- if (tb[TCA_MQPRIO_MODE]) {
- priv->flags |= TC_MQPRIO_F_MODE;
- priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
- }
-
- if (tb[TCA_MQPRIO_SHAPER]) {
- priv->flags |= TC_MQPRIO_F_SHAPER;
- priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
- }
-
- if (tb[TCA_MQPRIO_MIN_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
- return -EINVAL;
- i = 0;
- nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
- rem) {
- if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
- return -EINVAL;
- if (i >= qopt->num_tc)
- break;
- priv->min_rate[i] = *(u64 *)nla_data(attr);
- i++;
- }
- priv->flags |= TC_MQPRIO_F_MIN_RATE;
- }
-
- if (tb[TCA_MQPRIO_MAX_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
- return -EINVAL;
- i = 0;
- nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
- rem) {
- if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
- return -EINVAL;
- if (i >= qopt->num_tc)
- break;
- priv->max_rate[i] = *(u64 *)nla_data(attr);
- i++;
- }
- priv->flags |= TC_MQPRIO_F_MAX_RATE;
- }
}
/* pre-allocate qdisc, attachment can't fail */
@@ -243,36 +411,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
* supplied and verified mapping
*/
if (qopt->hw) {
- struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
-
- switch (priv->mode) {
- case TC_MQPRIO_MODE_DCB:
- if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
- return -EINVAL;
- break;
- case TC_MQPRIO_MODE_CHANNEL:
- mqprio.flags = priv->flags;
- if (priv->flags & TC_MQPRIO_F_MODE)
- mqprio.mode = priv->mode;
- if (priv->flags & TC_MQPRIO_F_SHAPER)
- mqprio.shaper = priv->shaper;
- if (priv->flags & TC_MQPRIO_F_MIN_RATE)
- for (i = 0; i < mqprio.qopt.num_tc; i++)
- mqprio.min_rate[i] = priv->min_rate[i];
- if (priv->flags & TC_MQPRIO_F_MAX_RATE)
- for (i = 0; i < mqprio.qopt.num_tc; i++)
- mqprio.max_rate[i] = priv->max_rate[i];
- break;
- default:
- return -EINVAL;
- }
- err = dev->netdev_ops->ndo_setup_tc(dev,
- TC_SETUP_QDISC_MQPRIO,
- &mqprio);
+ err = mqprio_enable_offload(sch, qopt, extack);
if (err)
return err;
-
- priv->hw_offload = mqprio.qopt.hw;
} else {
netdev_set_num_tc(dev, qopt->num_tc);
for (i = 0; i < qopt->num_tc; i++)
@@ -349,7 +490,7 @@ static int dump_rates(struct mqprio_sched *priv,
int i;
if (priv->flags & TC_MQPRIO_F_MIN_RATE) {
- nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64);
+ nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MIN_RATE64);
if (!nest)
goto nla_put_failure;
@@ -363,7 +504,7 @@ static int dump_rates(struct mqprio_sched *priv,
}
if (priv->flags & TC_MQPRIO_F_MAX_RATE) {
- nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64);
+ nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MAX_RATE64);
if (!nest)
goto nla_put_failure;
@@ -382,6 +523,33 @@ nla_put_failure:
return -1;
}
+static int mqprio_dump_tc_entries(struct mqprio_sched *priv,
+ struct sk_buff *skb)
+{
+ struct nlattr *n;
+ int tc;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY);
+ if (!n)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc]))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, n);
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, n);
+ return -EMSGSIZE;
+}
+
static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
@@ -389,10 +557,10 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
struct tc_mqprio_qopt opt = { 0 };
struct Qdisc *qdisc;
- unsigned int ntx, tc;
+ unsigned int ntx;
sch->q.qlen = 0;
- memset(&sch->bstats, 0, sizeof(sch->bstats));
+ gnet_stats_basic_sync_init(&sch->bstats);
memset(&sch->qstats, 0, sizeof(sch->qstats));
/* MQ supports lockless qdiscs. However, statistics accounting needs
@@ -401,41 +569,22 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
* qdisc totals are added at end.
*/
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
- qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
spin_lock_bh(qdisc_lock(qdisc));
- if (qdisc_is_percpu_stats(qdisc)) {
- __u32 qlen = qdisc_qlen_sum(qdisc);
-
- __gnet_stats_copy_basic(NULL, &sch->bstats,
- qdisc->cpu_bstats,
- &qdisc->bstats);
- __gnet_stats_copy_queue(&sch->qstats,
- qdisc->cpu_qstats,
- &qdisc->qstats, qlen);
- } else {
- sch->q.qlen += qdisc->q.qlen;
- sch->bstats.bytes += qdisc->bstats.bytes;
- sch->bstats.packets += qdisc->bstats.packets;
- sch->qstats.backlog += qdisc->qstats.backlog;
- sch->qstats.drops += qdisc->qstats.drops;
- sch->qstats.requeues += qdisc->qstats.requeues;
- sch->qstats.overlimits += qdisc->qstats.overlimits;
- }
+ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats,
+ &qdisc->bstats, false);
+ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats,
+ &qdisc->qstats);
+ sch->q.qlen += qdisc_qlen(qdisc);
spin_unlock_bh(qdisc_lock(qdisc));
}
- opt.num_tc = netdev_get_num_tc(dev);
- memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ mqprio_qopt_reconstruct(dev, &opt);
opt.hw = priv->hw_offload;
- for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
- opt.count[tc] = dev->tc_to_txq[tc].count;
- opt.offset[tc] = dev->tc_to_txq[tc].offset;
- }
-
- if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
if ((priv->flags & TC_MQPRIO_F_MODE) &&
@@ -451,6 +600,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
(dump_rates(priv, &opt, skb) != 0))
goto nla_put_failure;
+ if (mqprio_dump_tc_entries(priv, skb))
+ goto nla_put_failure;
+
return nla_nest_end(skb, nla);
nla_put_failure:
nlmsg_trim(skb, nla);
@@ -464,7 +616,7 @@ static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
if (!dev_queue)
return NULL;
- return dev_queue->qdisc_sleeping;
+ return rtnl_dereference(dev_queue->qdisc_sleeping);
}
static unsigned long mqprio_find(struct Qdisc *sch, u32 classid)
@@ -497,7 +649,7 @@ static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
tcm->tcm_parent = (tc < 0) ? 0 :
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(tc + TC_H_MIN_PRIORITY));
- tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle;
} else {
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_info = 0;
@@ -513,12 +665,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
{
if (cl >= TC_H_MIN_PRIORITY) {
int i;
- __u32 qlen = 0;
+ __u32 qlen;
struct gnet_stats_queue qstats = {0};
- struct gnet_stats_basic_packed bstats = {0};
+ struct gnet_stats_basic_sync bstats;
struct net_device *dev = qdisc_dev(sch);
struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK];
+ gnet_stats_basic_sync_init(&bstats);
		/* Drop lock here; it will be reclaimed before touching
		 * statistics. This is required because the d->lock we
		 * hold here is the lock on dev_queue->qdisc_sleeping
@@ -530,39 +683,32 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
for (i = tc.offset; i < tc.offset + tc.count; i++) {
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
- struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
- struct gnet_stats_queue __percpu *cpu_qstats = NULL;
spin_lock_bh(qdisc_lock(qdisc));
- if (qdisc_is_percpu_stats(qdisc)) {
- cpu_bstats = qdisc->cpu_bstats;
- cpu_qstats = qdisc->cpu_qstats;
- }
- qlen = qdisc_qlen_sum(qdisc);
- __gnet_stats_copy_basic(NULL, &sch->bstats,
- cpu_bstats, &qdisc->bstats);
- __gnet_stats_copy_queue(&sch->qstats,
- cpu_qstats,
- &qdisc->qstats,
- qlen);
+ gnet_stats_add_basic(&bstats, qdisc->cpu_bstats,
+ &qdisc->bstats, false);
+ gnet_stats_add_queue(&qstats, qdisc->cpu_qstats,
+ &qdisc->qstats);
+ sch->q.qlen += qdisc_qlen(qdisc);
+
spin_unlock_bh(qdisc_lock(qdisc));
}
+ qlen = qdisc_qlen(sch) + qstats.qlen;
/* Reclaim root sleeping lock before completing stats */
if (d->lock)
spin_lock_bh(d->lock);
- if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
+ if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 ||
gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
return -1;
} else {
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
- sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &sch->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL,
- &sch->qstats, sch->q.qlen) < 0)
+ sch = rtnl_dereference(dev_queue->qdisc_sleeping);
+ if (gnet_stats_copy_basic(d, sch->cpu_bstats,
+ &sch->bstats, true) < 0 ||
+ qdisc_qstats_copy(d, sch) < 0)
return -1;
}
return 0;
@@ -579,11 +725,8 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
/* Walk hierarchy with a virtual class per tc */
arg->count = arg->skip;
for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) {
- if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, ntx + TC_H_MIN_PRIORITY, arg))
return;
- }
- arg->count++;
}
/* Pad the values and skip over unused traffic classes */
@@ -625,9 +768,11 @@ static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
.init = mqprio_init,
.destroy = mqprio_destroy,
.attach = mqprio_attach,
+ .change_real_num_tx = mq_change_real_num_tx,
.dump = mqprio_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("mqprio");
static int __init mqprio_module_init(void)
{
@@ -643,3 +788,4 @@ module_init(mqprio_module_init);
module_exit(mqprio_module_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Classful multiqueue prio qdisc");
diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
new file mode 100644
index 000000000000..b3a5572c167b
--- /dev/null
+++ b/net/sched/sch_mqprio_lib.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/types.h>
+#include <net/pkt_sched.h>
+
+#include "sch_mqprio_lib.h"
+
+/* Returns true if the intervals [a, b) and [c, d) overlap. */
+static bool intervals_overlap(int a, int b, int c, int d)
+{
+ int left = max(a, c), right = min(b, d);
+
+ return left < right;
+}
+
+static int mqprio_validate_queue_counts(struct net_device *dev,
+ const struct tc_mqprio_qopt *qopt,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack)
+{
+ int i, j;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ if (!qopt->count[i]) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d",
+ i);
+ return -EINVAL;
+ }
+
+		/* Verify the queue count is within the TX range; a range
+		 * ending at real_num_tx_queues means the last queue is in use.
+		 */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ last > dev->real_num_tx_queues) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "Queues %d:%d for TC %d exceed the %d TX queues available",
+ qopt->count[i], qopt->offset[i],
+ i, dev->real_num_tx_queues);
+ return -EINVAL;
+ }
+
+ if (allow_overlapping_txqs)
+ continue;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (intervals_overlap(qopt->offset[i], last,
+ qopt->offset[j],
+ qopt->offset[j] +
+ qopt->count[j])) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "TC %d queues %d@%d overlap with TC %d queues %d@%d",
+ i, qopt->count[i], qopt->offset[i],
+ j, qopt->count[j], qopt->offset[j]);
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ bool validate_queue_counts,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE) {
+ NL_SET_ERR_MSG(extack,
+ "Number of traffic classes is outside valid range");
+ return -EINVAL;
+ }
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i <= TC_BITMASK; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid traffic class in priority to traffic class mapping");
+ return -EINVAL;
+ }
+ }
+
+ if (validate_queue_counts) {
+ err = mqprio_validate_queue_counts(dev, qopt,
+ allow_overlapping_txqs,
+ extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mqprio_validate_qopt);
+
+void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int tc, num_tc = netdev_get_num_tc(dev);
+
+ qopt->num_tc = num_tc;
+ memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map));
+
+ for (tc = 0; tc < num_tc; tc++) {
+ qopt->count[tc] = dev->tc_to_txq[tc].count;
+ qopt->offset[tc] = dev->tc_to_txq[tc].offset;
+ }
+}
+EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);
+
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct tc_mqprio_qopt_offload *mqprio)
+{
+ unsigned long preemptible_tcs = 0;
+ int tc;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ if (fp[tc] == TC_FP_PREEMPTIBLE)
+ preemptible_tcs |= BIT(tc);
+
+ mqprio->preemptible_tcs = preemptible_tcs;
+}
+EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Shared mqprio qdisc code currently between taprio and mqprio");
diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
new file mode 100644
index 000000000000..079f597072e3
--- /dev/null
+++ b/net/sched/sch_mqprio_lib.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __SCH_MQPRIO_LIB_H
+#define __SCH_MQPRIO_LIB_H
+
+#include <linux/types.h>
+
+struct net_device;
+struct netlink_ext_ack;
+struct tc_mqprio_qopt;
+
+int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ bool validate_queue_counts,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack);
+void mqprio_qopt_reconstruct(struct net_device *dev,
+ struct tc_mqprio_qopt *qopt);
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct tc_mqprio_qopt_offload *mqprio);
+
+#endif
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 7410ce4d0321..06e03f5cd7ce 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
@@ -47,14 +36,14 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- err = tcf_classify(skb, fl, &res, false);
+ err = tcf_classify(skb, NULL, fl, &res, false);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -163,7 +152,6 @@ multiq_reset(struct Qdisc *sch)
for (band = 0; band < q->bands; band++)
qdisc_reset(q->queues[band]);
- sch->q.qlen = 0;
q->curband = 0;
}
@@ -185,7 +173,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
- int i;
+ struct Qdisc **removed;
+ int i, n_removed = 0;
if (!netif_is_multiqueue(qdisc_dev(sch)))
return -EOPNOTSUPP;
@@ -196,20 +185,29 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+ removed = kmalloc(sizeof(*removed) * (q->max_bands - qopt->bands),
+ GFP_KERNEL);
+ if (!removed)
+ return -ENOMEM;
+
sch_tree_lock(sch);
q->bands = qopt->bands;
for (i = q->bands; i < q->max_bands; i++) {
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];
+
q->queues[i] = &noop_qdisc;
- qdisc_tree_reduce_backlog(child, child->q.qlen,
- child->qstats.backlog);
- qdisc_put(child);
+ qdisc_purge_queue(child);
+ removed[n_removed++] = child;
}
}
sch_tree_unlock(sch);
+ for (i = 0; i < n_removed; i++)
+ qdisc_put(removed[i]);
+ kfree(removed);
+
for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
@@ -224,13 +222,10 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
if (child != &noop_qdisc)
qdisc_hash_add(child, true);
- if (old != &noop_qdisc) {
- qdisc_tree_reduce_backlog(old,
- old->q.qlen,
- old->qstats.backlog);
- qdisc_put(old);
- }
+ if (old != &noop_qdisc)
+ qdisc_purge_queue(old);
sch_tree_unlock(sch);
+ qdisc_put(old);
}
}
}
@@ -342,9 +337,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct Qdisc *cl_q;
cl_q = q->queues[cl - 1];
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
return -1;
return 0;
@@ -359,15 +353,8 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (band = 0; band < q->bands; band++) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, band + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, band + 1, arg))
break;
- }
- arg->count++;
}
}
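The multiq_walk(), netem_walk() and mqprio_walk() hunks above all replace the same open-coded skip/count/stop bookkeeping with tc_qdisc_stats_dump(). A self-contained sketch of the equivalent logic, reconstructed from the deleted lines rather than from the helper's own source:

/* Userspace sketch of the skip/count/stop bookkeeping that the removed
 * open-coded loops performed and that tc_qdisc_stats_dump() now wraps.
 * The walker struct below is a minimal stand-in for the kernel's
 * struct qdisc_walker, reconstructed from the deleted lines above.
 */
#include <stdbool.h>

struct walker {
	int stop;
	int skip;
	int count;
	int (*fn)(unsigned long cl, struct walker *arg);
};

static bool walk_one_class(unsigned long cl, struct walker *arg)
{
	if (arg->count >= arg->skip) {
		if (arg->fn(cl, arg) < 0) {
			arg->stop = 1;
			return false;	/* caller stops iterating */
		}
	}
	arg->count++;
	return true;		/* caller moves on to the next class */
}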
@@ -408,6 +395,7 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
.dump = multiq_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("multiq");
static int __init multiq_module_init(void)
{
@@ -423,3 +411,4 @@ module_init(multiq_module_init)
module_exit(multiq_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Multi queue to hardware queue mapping qdisc");
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 75046ec72144..32a5f3304046 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_netem.c Network emulator
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License.
- *
* Many of the algorithms and ideas for this came from
* NIST Net which is not copyrighted.
*
@@ -21,10 +17,12 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
+#include <linux/prandom.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>
+#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
@@ -70,7 +68,7 @@
struct disttable {
u32 size;
- s16 table[0];
+ s16 table[] __counted_by(size);
};
struct netem_sched_data {
@@ -81,6 +79,8 @@ struct netem_sched_data {
struct sk_buff *t_head;
struct sk_buff *t_tail;
+ u32 t_len;
+
/* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
@@ -108,6 +108,11 @@ struct netem_sched_data {
u32 rho;
} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
+ struct prng {
+ u64 seed;
+ struct rnd_state prng_state;
+ } prng;
+
struct disttable *delay_dist;
enum {
@@ -175,22 +180,23 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
static void init_crandom(struct crndstate *state, unsigned long rho)
{
state->rho = rho;
- state->last = prandom_u32();
+ state->last = get_random_u32();
}
/* get_crandom - correlated random number generator
* Next number depends on last value.
* rho is scaled to avoid floating point.
*/
-static u32 get_crandom(struct crndstate *state)
+static u32 get_crandom(struct crndstate *state, struct prng *p)
{
u64 value, rho;
unsigned long answer;
+ struct rnd_state *s = &p->prng_state;
if (!state || state->rho == 0) /* no correlation */
- return prandom_u32();
+ return prandom_u32_state(s);
- value = prandom_u32();
+ value = prandom_u32_state(s);
rho = (u64)state->rho + 1;
answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
state->last = answer;
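get_crandom() blends each new pseudo-random value with the previous output using the fixed-point weight rho, i.e. answer = (value * (2^32 - rho) + last * rho) >> 32, so rho = 0 yields an uncorrelated stream and values near 2^32 make successive outputs track each other closely. A minimal userspace sketch of the same blend:

/* Userspace sketch of the correlated blend used by get_crandom() above.
 * rho32 is a 32-bit fixed-point correlation weight (0 = none, ~2^32 = full).
 */
#include <stdint.h>

static uint32_t crandom_step(uint32_t value, uint32_t last, uint32_t rho32)
{
	uint64_t rho = (uint64_t)rho32 + 1;

	return (uint32_t)(((uint64_t)value * ((1ULL << 32) - rho) +
			   (uint64_t)last * rho) >> 32);
}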
@@ -204,7 +210,7 @@ static u32 get_crandom(struct crndstate *state)
static bool loss_4state(struct netem_sched_data *q)
{
struct clgstate *clg = &q->clg;
- u32 rnd = prandom_u32();
+ u32 rnd = prandom_u32_state(&q->prng.prng_state);
/*
* Makes a comparison between rnd and the transition
@@ -212,17 +218,17 @@ static bool loss_4state(struct netem_sched_data *q)
* next state and if the next packet has to be transmitted or lost.
* The four states correspond to:
* TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
- * LOST_IN_BURST_PERIOD => isolated losses within a gap period
- * LOST_IN_GAP_PERIOD => lost packets within a burst period
- * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
+ * LOST_IN_GAP_PERIOD => isolated losses within a gap period
+ * LOST_IN_BURST_PERIOD => lost packets within a burst period
+ * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
*/
switch (clg->state) {
case TX_IN_GAP_PERIOD:
if (rnd < clg->a4) {
- clg->state = LOST_IN_BURST_PERIOD;
+ clg->state = LOST_IN_GAP_PERIOD;
return true;
} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
- clg->state = LOST_IN_GAP_PERIOD;
+ clg->state = LOST_IN_BURST_PERIOD;
return true;
} else if (clg->a1 + clg->a4 < rnd) {
clg->state = TX_IN_GAP_PERIOD;
@@ -231,24 +237,24 @@ static bool loss_4state(struct netem_sched_data *q)
break;
case TX_IN_BURST_PERIOD:
if (rnd < clg->a5) {
- clg->state = LOST_IN_GAP_PERIOD;
+ clg->state = LOST_IN_BURST_PERIOD;
return true;
} else {
clg->state = TX_IN_BURST_PERIOD;
}
break;
- case LOST_IN_GAP_PERIOD:
+ case LOST_IN_BURST_PERIOD:
if (rnd < clg->a3)
clg->state = TX_IN_BURST_PERIOD;
else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
clg->state = TX_IN_GAP_PERIOD;
} else if (clg->a2 + clg->a3 < rnd) {
- clg->state = LOST_IN_GAP_PERIOD;
+ clg->state = LOST_IN_BURST_PERIOD;
return true;
}
break;
- case LOST_IN_BURST_PERIOD:
+ case LOST_IN_GAP_PERIOD:
clg->state = TX_IN_GAP_PERIOD;
break;
}
@@ -269,18 +275,19 @@ static bool loss_4state(struct netem_sched_data *q)
static bool loss_gilb_ell(struct netem_sched_data *q)
{
struct clgstate *clg = &q->clg;
+ struct rnd_state *s = &q->prng.prng_state;
switch (clg->state) {
case GOOD_STATE:
- if (prandom_u32() < clg->a1)
+ if (prandom_u32_state(s) < clg->a1)
clg->state = BAD_STATE;
- if (prandom_u32() < clg->a4)
+ if (prandom_u32_state(s) < clg->a4)
return true;
break;
case BAD_STATE:
- if (prandom_u32() < clg->a2)
+ if (prandom_u32_state(s) < clg->a2)
clg->state = GOOD_STATE;
- if (prandom_u32() > clg->a3)
+ if (prandom_u32_state(s) > clg->a3)
return true;
}
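loss_gilb_ell() above is the two-state Gilbert-Elliott chain: a1 and a2 are the GOOD-to-BAD and BAD-to-GOOD transition probabilities, a4 is the loss probability in the GOOD state, and losses in the BAD state occur when the draw exceeds a3, all in 32-bit fixed point. A self-contained sketch of the same chain with a caller-supplied random source, for illustration only:

/* Userspace sketch of the Gilbert-Elliott loss chain in loss_gilb_ell().
 * All probabilities are 32-bit fixed point, matching the kernel scaling.
 */
#include <stdbool.h>
#include <stdint.h>

struct gilb_ell {
	enum { GOOD, BAD } state;
	uint32_t a1;	/* P(GOOD -> BAD) */
	uint32_t a2;	/* P(BAD -> GOOD) */
	uint32_t a3;	/* draws above this value are lost in BAD */
	uint32_t a4;	/* P(loss | GOOD) */
};

static bool gilb_ell_lose(struct gilb_ell *g, uint32_t (*rnd)(void))
{
	switch (g->state) {
	case GOOD:
		if (rnd() < g->a1)
			g->state = BAD;
		if (rnd() < g->a4)
			return true;
		break;
	case BAD:
		if (rnd() < g->a2)
			g->state = GOOD;
		if (rnd() > g->a3)
			return true;
		break;
	}
	return false;
}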
@@ -292,7 +299,7 @@ static bool loss_event(struct netem_sched_data *q)
switch (q->loss_model) {
case CLG_RANDOM:
/* Random packet drop 0 => none, ~0 => all */
- return q->loss && q->loss >= get_crandom(&q->loss_cor);
+ return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng);
case CLG_4_STATES:
/* 4state loss model algorithm (used also for GI model)
@@ -321,6 +328,7 @@ static bool loss_event(struct netem_sched_data *q)
*/
static s64 tabledist(s64 mu, s32 sigma,
struct crndstate *state,
+ struct prng *prng,
const struct disttable *dist)
{
s64 x;
@@ -330,11 +338,11 @@ static s64 tabledist(s64 mu, s32 sigma,
if (sigma == 0)
return mu;
- rnd = get_crandom(state);
+ rnd = get_crandom(state, prng);
/* default uniform distribution */
if (dist == NULL)
- return ((rnd % (2 * sigma)) + mu) - sigma;
+ return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
t = dist->table[rnd % dist->size];
x = (sigma % NETEM_DIST_SCALE) * t;
@@ -377,6 +385,7 @@ static void tfifo_reset(struct Qdisc *sch)
rtnl_kfree_skbs(q->t_head, q->t_tail);
q->t_head = NULL;
q->t_tail = NULL;
+ q->t_len = 0;
}
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
@@ -406,6 +415,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
rb_link_node(&nskb->rbnode, parent, p);
rb_insert_color(&nskb->rbnode, &q->t_root);
}
+ q->t_len++;
sch->q.qlen++;
}
@@ -419,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff *segs;
netdev_features_t features = netif_skb_features(skb);
+ qdisc_skb_cb(skb)->pkt_segs = 1;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
@@ -441,18 +452,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct netem_sched_data *q = qdisc_priv(sch);
/* We don't fill cb now as skb_unshare() may invalidate it */
struct netem_skb_cb *cb;
- struct sk_buff *skb2;
+ struct sk_buff *skb2 = NULL;
struct sk_buff *segs = NULL;
- unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
- int nb = 0;
+ unsigned int prev_len = qdisc_pkt_len(skb);
int count = 1;
- int rc = NET_XMIT_SUCCESS;
/* Do not fool qdisc_drop_all() */
skb->prev = NULL;
/* Random duplication */
- if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
++count;
/* Drop packet? */
@@ -475,18 +484,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
skb_orphan_partial(skb);
/*
- * If we need to duplicate packet, then re-insert at top of the
- * qdisc tree, since parent queuer expects that only one
- * skb will be queued.
+	 * If we need to duplicate the packet, then clone it before
+	 * the original is modified.
*/
- if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
- struct Qdisc *rootq = qdisc_root(sch);
- u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
-
- q->duplicate = 0;
- rootq->enqueue(skb2, rootq, to_free);
- q->duplicate = dupsave;
- }
+ if (count > 1)
+ skb2 = skb_clone(skb, GFP_ATOMIC);
/*
* Randomized packet corruption.
@@ -494,17 +496,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
* If packet is going to be hardware checksummed, then
* do it now in software before we mangle it.
*/
- if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) {
if (skb_is_gso(skb)) {
- segs = netem_segment(skb, sch, to_free);
- if (!segs)
- return NET_XMIT_DROP;
- } else {
- segs = skb;
- }
+ skb = netem_segment(skb, sch, to_free);
+ if (!skb)
+ goto finish_segs;
- skb = segs;
- segs = segs->next;
+ segs = skb->next;
+ skb_mark_not_on_list(skb);
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ }
skb = skb_unshare(skb, GFP_ATOMIC);
if (unlikely(!skb)) {
@@ -514,27 +515,49 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_help(skb)) {
qdisc_drop(skb, sch, to_free);
+ skb = NULL;
goto finish_segs;
}
- skb->data[prandom_u32() % skb_headlen(skb)] ^=
- 1<<(prandom_u32() % 8);
+ skb->data[get_random_u32_below(skb_headlen(skb))] ^=
+ 1<<get_random_u32_below(8);
+ }
+
+ if (unlikely(q->t_len >= sch->limit)) {
+ /* re-link segs, so that qdisc_drop_all() frees them all */
+ skb->next = segs;
+ qdisc_drop_all(skb, sch, to_free);
+ if (skb2)
+ __qdisc_drop(skb2, to_free);
+ return NET_XMIT_DROP;
}
- if (unlikely(sch->q.qlen >= sch->limit))
- return qdisc_drop_all(skb, sch, to_free);
+ /*
+ * If doing duplication then re-insert at top of the
+ * qdisc tree, since parent queuer expects that only one
+ * skb will be queued.
+ */
+ if (skb2) {
+ struct Qdisc *rootq = qdisc_root_bh(sch);
+ u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+
+ q->duplicate = 0;
+ rootq->enqueue(skb2, rootq, to_free);
+ q->duplicate = dupsave;
+ skb2 = NULL;
+ }
qdisc_qstats_backlog_inc(sch, skb);
cb = netem_skb_cb(skb);
if (q->gap == 0 || /* not doing reordering */
q->counter < q->gap - 1 || /* inside last reordering gap */
- q->reorder < get_crandom(&q->reorder_cor)) {
+ q->reorder < get_crandom(&q->reorder_cor, &q->prng)) {
u64 now;
s64 delay;
delay = tabledist(q->latency, q->jitter,
- &q->delay_cor, q->delay_dist);
+ &q->delay_cor, &q->prng, q->delay_dist);
now = ktime_get_ns();
@@ -592,7 +615,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
finish_segs:
+ if (skb2)
+ __qdisc_drop(skb2, to_free);
+
if (segs) {
+ unsigned int len, last_len;
+ int rc, nb;
+
+ len = skb ? skb->len : 0;
+ nb = skb ? 1 : 0;
+
while (segs) {
skb2 = segs->next;
skb_mark_not_on_list(segs);
@@ -608,9 +640,10 @@ finish_segs:
}
segs = skb2;
}
- sch->q.qlen += nb;
- if (nb > 1)
- qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
+ /* Parent qdiscs accounted for 1 skb of size @prev_len */
+ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
+ } else if (!skb) {
+ return NET_XMIT_DROP;
}
return NET_XMIT_SUCCESS;
}
@@ -625,13 +658,13 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
if (!q->slot_dist)
next_delay = q->slot_config.min_delay +
- (prandom_u32() *
+ (get_random_u32() *
(q->slot_config.max_delay -
q->slot_config.min_delay) >> 32);
else
next_delay = tabledist(q->slot_config.dist_delay,
(s32)(q->slot_config.dist_jitter),
- NULL, q->slot_dist);
+ NULL, &q->prng, q->slot_dist);
q->slot.slot_next = now + next_delay;
q->slot.packets_left = q->slot_config.max_packets;
@@ -674,8 +707,8 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
tfifo_dequeue:
skb = __qdisc_dequeue_head(&sch->q);
if (skb) {
- qdisc_qstats_backlog_dec(sch, skb);
deliver:
+ qdisc_qstats_backlog_dec(sch, skb);
qdisc_bstats_update(sch, skb);
return skb;
}
@@ -691,8 +724,7 @@ deliver:
if (time_to_send <= now && q->slot.slot_next <= now) {
netem_erase_head(q, skb);
- sch->q.qlen--;
- qdisc_qstats_backlog_dec(sch, skb);
+ q->t_len--;
skb->next = NULL;
skb->prev = NULL;
/* skb->dev shares skb->rbnode area,
@@ -715,21 +747,25 @@ deliver:
err = qdisc_enqueue(skb, q->qdisc, &to_free);
kfree_skb_list(to_free);
- if (err != NET_XMIT_SUCCESS &&
- net_xmit_drop_count(err)) {
- qdisc_qstats_drop(sch);
- qdisc_tree_reduce_backlog(sch, 1,
- pkt_len);
+ if (err != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(err))
+ qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= pkt_len;
+ sch->q.qlen--;
+ qdisc_tree_reduce_backlog(sch, 1, pkt_len);
}
goto tfifo_dequeue;
}
+ sch->q.qlen--;
goto deliver;
}
if (q->qdisc) {
skb = q->qdisc->ops->dequeue(q->qdisc);
- if (skb)
+ if (skb) {
+ sch->q.qlen--;
goto deliver;
+ }
}
qdisc_watchdog_schedule_ns(&q->watchdog,
@@ -739,8 +775,10 @@ deliver:
if (q->qdisc) {
skb = q->qdisc->ops->dequeue(q->qdisc);
- if (skb)
+ if (skb) {
+ sch->q.qlen--;
goto deliver;
+ }
}
return NULL;
}
@@ -766,19 +804,17 @@ static void dist_free(struct disttable *d)
* signed 16 bit values.
*/
-static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
- const struct nlattr *attr)
+static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
{
size_t n = nla_len(attr)/sizeof(__s16);
const __s16 *data = nla_data(attr);
- spinlock_t *root_lock;
struct disttable *d;
int i;
- if (n > NETEM_DIST_MAX)
+ if (!n || n > NETEM_DIST_MAX)
return -EINVAL;
- d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
+ d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
if (!d)
return -ENOMEM;
@@ -786,13 +822,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
for (i = 0; i < n; i++)
d->table[i] = data[i];
- root_lock = qdisc_root_sleeping_lock(sch);
-
- spin_lock_bh(root_lock);
- swap(*tbl, d);
- spin_unlock_bh(root_lock);
-
- dist_free(d);
+ *tbl = d;
return 0;
}
@@ -805,6 +835,10 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
q->slot_config.max_packets = INT_MAX;
if (q->slot_config.max_bytes == 0)
q->slot_config.max_bytes = INT_MAX;
+
+ /* capping dist_jitter to the range acceptable by tabledist() */
+ q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
+
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
if (q->slot_config.min_delay | q->slot_config.max_delay |
@@ -918,6 +952,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
[TCA_NETEM_JITTER64] = { .type = NLA_S64 },
[TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
+ [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -931,32 +966,80 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
}
if (nested_len >= nla_attr_size(0))
- return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
- nested_len, policy, NULL);
+ return nla_parse_deprecated(tb, maxtype,
+ nla_data(nla) + NLA_ALIGN(len),
+ nested_len, policy, NULL);
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
+static const struct Qdisc_class_ops netem_class_ops;
+
+static int check_netem_in_tree(struct Qdisc *sch, bool duplicates,
+ struct netlink_ext_ack *extack)
+{
+ struct Qdisc *root, *q;
+ unsigned int i;
+
+ root = qdisc_root_sleeping(sch);
+
+ if (sch != root && root->ops->cl_ops == &netem_class_ops) {
+ if (duplicates ||
+ ((struct netem_sched_data *)qdisc_priv(root))->duplicate)
+ goto err;
+ }
+
+ if (!qdisc_dev(root))
+ return 0;
+
+ hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) {
+ if (sch != q && q->ops->cl_ops == &netem_class_ops) {
+ if (duplicates ||
+ ((struct netem_sched_data *)qdisc_priv(q))->duplicate)
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ NL_SET_ERR_MSG(extack,
+ "netem: cannot mix duplicating netems with other netems in tree");
+ return -EINVAL;
+}
+
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_NETEM_MAX + 1];
+ struct disttable *delay_dist = NULL;
+ struct disttable *slot_dist = NULL;
struct tc_netem_qopt *qopt;
struct clgstate old_clg;
int old_loss_model = CLG_RANDOM;
int ret;
- if (opt == NULL)
- return -EINVAL;
-
qopt = nla_data(opt);
ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
if (ret < 0)
return ret;
+ if (tb[TCA_NETEM_DELAY_DIST]) {
+ ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
+ if (ret)
+ goto table_free;
+ }
+
+ if (tb[TCA_NETEM_SLOT_DIST]) {
+ ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
+ if (ret)
+ goto table_free;
+ }
+
+ sch_tree_lock(sch);
/* backup q->clg and q->loss_model */
old_clg = q->clg;
old_loss_model = q->loss_model;
@@ -965,26 +1048,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
if (ret) {
q->loss_model = old_loss_model;
- return ret;
+ q->clg = old_clg;
+ goto unlock;
}
} else {
q->loss_model = CLG_RANDOM;
}
- if (tb[TCA_NETEM_DELAY_DIST]) {
- ret = get_dist_table(sch, &q->delay_dist,
- tb[TCA_NETEM_DELAY_DIST]);
- if (ret)
- goto get_table_failure;
- }
-
- if (tb[TCA_NETEM_SLOT_DIST]) {
- ret = get_dist_table(sch, &q->slot_dist,
- tb[TCA_NETEM_SLOT_DIST]);
- if (ret)
- goto get_table_failure;
- }
-
+ if (delay_dist)
+ swap(q->delay_dist, delay_dist);
+ if (slot_dist)
+ swap(q->slot_dist, slot_dist);
sch->limit = qopt->limit;
q->latency = PSCHED_TICKS2NS(qopt->latency);
@@ -993,6 +1067,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
q->gap = qopt->gap;
q->counter = 0;
q->loss = qopt->loss;
+
+ ret = check_netem_in_tree(sch, qopt->duplicate, extack);
+ if (ret)
+ goto unlock;
+
q->duplicate = qopt->duplicate;
/* for compatibility with earlier versions.
@@ -1029,15 +1108,21 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_NETEM_SLOT])
get_slot(q, tb[TCA_NETEM_SLOT]);
- return ret;
+ /* capping jitter to the range acceptable by tabledist() */
+ q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
-get_table_failure:
- /* recover clg and loss_model, in case of
- * q->clg and q->loss_model were modified
- * in get_loss_clg()
- */
- q->clg = old_clg;
- q->loss_model = old_loss_model;
+ if (tb[TCA_NETEM_PRNG_SEED])
+ q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
+ else
+ q->prng.seed = get_random_u64();
+ prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+
+unlock:
+ sch_tree_unlock(sch);
+
+table_free:
+ dist_free(delay_dist);
+ dist_free(slot_dist);
return ret;
}
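The reworked netem_change() above allocates the delay and slot distribution tables with GFP_KERNEL before taking sch_tree_lock(), publishes them with swap() while the lock is held, and frees whichever table is displaced after unlock. A userspace sketch of that allocate-then-swap pattern, with pthread locking standing in for the qdisc tree lock and illustrative names throughout:

/* Userspace sketch of the allocate-then-swap pattern adopted above:
 * do the potentially sleeping allocation before taking the lock,
 * publish the new table with a pointer swap under the lock, and free
 * whichever table is left over after unlocking. Names are illustrative.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct table {
	size_t n;
	short entries[];
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct table *active_table;

static int replace_table(const short *data, size_t n)
{
	struct table *new_tbl, *old_tbl;

	new_tbl = malloc(sizeof(*new_tbl) + n * sizeof(short));	/* may "sleep" */
	if (!new_tbl)
		return -1;
	new_tbl->n = n;
	memcpy(new_tbl->entries, data, n * sizeof(short));

	pthread_mutex_lock(&tree_lock);		/* stands in for sch_tree_lock() */
	old_tbl = active_table;
	active_table = new_tbl;
	pthread_mutex_unlock(&tree_lock);

	free(old_tbl);		/* free the displaced table outside the lock */
	return 0;
}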
@@ -1075,7 +1160,7 @@ static int dump_loss_model(const struct netem_sched_data *q,
{
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
if (nest == NULL)
goto nla_put_failure;
@@ -1131,9 +1216,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_netem_rate rate;
struct tc_netem_slot slot;
- qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
+ qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
UINT_MAX);
- qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
+ qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
UINT_MAX);
qopt.limit = q->limit;
qopt.loss = q->loss;
@@ -1195,6 +1280,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
goto nla_put_failure;
}
+ if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed,
+ TCA_NETEM_PAD))
+ goto nla_put_failure;
+
return nla_nest_end(skb, nla);
nla_put_failure:
@@ -1239,12 +1328,8 @@ static unsigned long netem_find(struct Qdisc *sch, u32 classid)
static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
- if (walker->count >= walker->skip)
- if (walker->fn(sch, 1, walker) < 0) {
- walker->stop = 1;
- return;
- }
- walker->count++;
+ if (!tc_qdisc_stats_dump(sch, 1, walker))
+ return;
}
}
@@ -1270,6 +1355,7 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.dump = netem_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("netem");
static int __init netem_module_init(void)
@@ -1284,3 +1370,4 @@ static void __exit netem_module_exit(void)
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Network characteristics emulator qdisc");
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index d1429371592f..0a377313b6a9 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Author: Vijay Subramanian <vijaynsu@cisco.com>
* Author: Mythili Prabhu <mysuryan@cisco.com>
*
@@ -17,9 +8,7 @@
* University of Oslo, Norway.
*
* References:
- * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
- * IEEE Conference on High Performance Switching and Routing 2013 :
- * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
+ * RFC 8033: https://tools.ietf.org/html/rfc8033
*/
#include <linux/module.h>
@@ -30,114 +19,73 @@
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
-
-#define QUEUE_THRESHOLD 10000
-#define DQCOUNT_INVALID -1
-#define MAX_PROB 0xffffffff
-#define PIE_SCALE 8
-
-/* parameters used */
-struct pie_params {
- psched_time_t target; /* user specified target delay in pschedtime */
- u32 tupdate; /* timer frequency (in jiffies) */
- u32 limit; /* number of packets that can be enqueued */
- u32 alpha; /* alpha and beta are between 0 and 32 */
- u32 beta; /* and are used for shift relative to 1 */
- bool ecn; /* true if ecn is enabled */
- bool bytemode; /* to scale drop early prob based on pkt size */
-};
-
-/* variables used */
-struct pie_vars {
- u32 prob; /* probability but scaled by u32 limit. */
- psched_time_t burst_time;
- psched_time_t qdelay;
- psched_time_t qdelay_old;
- u64 dq_count; /* measured in bytes */
- psched_time_t dq_tstamp; /* drain rate */
- u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
- u32 qlen_old; /* in bytes */
-};
-
-/* statistics gathering */
-struct pie_stats {
- u32 packets_in; /* total number of packets enqueued */
- u32 dropped; /* packets dropped due to pie_action */
- u32 overlimit; /* dropped due to lack of space in queue */
- u32 maxq; /* maximum queue size */
- u32 ecn_mark; /* packets marked with ECN */
-};
+#include <net/pie.h>
/* private data for the Qdisc */
struct pie_sched_data {
- struct pie_params params;
struct pie_vars vars;
+ struct pie_params params;
struct pie_stats stats;
struct timer_list adapt_timer;
struct Qdisc *sch;
};
-static void pie_params_init(struct pie_params *params)
-{
- params->alpha = 2;
- params->beta = 20;
- params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
- params->limit = 1000; /* default of 1000 packets */
- params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
- params->ecn = false;
- params->bytemode = false;
-}
-
-static void pie_vars_init(struct pie_vars *vars)
-{
- vars->dq_count = DQCOUNT_INVALID;
- vars->avg_dq_rate = 0;
- /* default of 100 ms in pschedtime */
- vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
-}
-
-static bool drop_early(struct Qdisc *sch, u32 packet_size)
+bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
+ struct pie_vars *vars, u32 backlog, u32 packet_size)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- u32 rnd;
- u32 local_prob = q->vars.prob;
+ u64 rnd;
+ u64 local_prob = vars->prob;
u32 mtu = psched_mtu(qdisc_dev(sch));
	/* If there is still burst allowance left, skip random early drop */
- if (q->vars.burst_time > 0)
+ if (vars->burst_time > 0)
return false;
/* If current delay is less than half of target, and
* if drop prob is low already, disable early_drop
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.prob < MAX_PROB / 5))
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->prob < MAX_PROB / 5))
return false;
- /* If we have fewer than 2 mtu-sized packets, disable drop_early,
+ /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early,
* similar to min_th in RED
*/
- if (sch->qstats.backlog < 2 * mtu)
+ if (backlog < 2 * mtu)
return false;
/* If bytemode is turned on, use packet size to compute new
	 * probability. Smaller packets will have lower drop prob in this case
*/
- if (q->params.bytemode && packet_size <= mtu)
- local_prob = (local_prob / mtu) * packet_size;
+ if (params->bytemode && packet_size <= mtu)
+ local_prob = (u64)packet_size * div_u64(local_prob, mtu);
else
- local_prob = q->vars.prob;
+ local_prob = vars->prob;
+
+ if (local_prob == 0)
+ vars->accu_prob = 0;
+ else
+ vars->accu_prob += local_prob;
+
+ if (vars->accu_prob < (MAX_PROB / 100) * 85)
+ return false;
+ if (vars->accu_prob >= (MAX_PROB / 2) * 17)
+ return true;
- rnd = prandom_u32();
- if (rnd < local_prob)
+ get_random_bytes(&rnd, 8);
+ if ((rnd >> BITS_PER_BYTE) < local_prob) {
+ vars->accu_prob = 0;
return true;
+ }
return false;
}
+EXPORT_SYMBOL_GPL(pie_drop_early);
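The accu_prob bookkeeping above is the RFC 8033 derandomization: early drop is suppressed while the accumulated probability stays below 0.85 (MAX_PROB / 100 * 85), forced once it reaches 8.5 (MAX_PROB / 2 * 17), and left to the random draw only in between. A self-contained sketch of just that gate, with MAX_PROB as a placeholder scale:

/* Userspace sketch of the RFC 8033 derandomization gate used by
 * pie_drop_early() above. MAX_PROB here is a placeholder fixed-point 1.0;
 * the kernel value lives in include/net/pie.h.
 */
#include <stdint.h>

#define MAX_PROB  ((uint64_t)1 << 56)	/* placeholder fixed-point 1.0 */

enum pie_gate { PIE_NO_DROP, PIE_FORCE_DROP, PIE_RANDOM_DROP };

static enum pie_gate pie_accu_gate(uint64_t *accu_prob, uint64_t local_prob)
{
	if (local_prob == 0)
		*accu_prob = 0;
	else
		*accu_prob += local_prob;

	if (*accu_prob < (MAX_PROB / 100) * 85)	/* below 0.85: never drop */
		return PIE_NO_DROP;
	if (*accu_prob >= (MAX_PROB / 2) * 17)	/* at/above 8.5: always drop */
		return PIE_FORCE_DROP;
	return PIE_RANDOM_DROP;			/* fall back to the random draw */
}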
static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
struct pie_sched_data *q = qdisc_priv(sch);
bool enqueue = false;
@@ -146,7 +94,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
goto out;
}
- if (!drop_early(sch, skb->len)) {
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
+
+ if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog,
+ skb->len)) {
enqueue = true;
} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
INET_ECN_set_ce(skb)) {
@@ -159,6 +110,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* we can enqueue the packet */
if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
q->stats.packets_in++;
if (qdisc_qlen(sch) > q->stats.maxq)
q->stats.maxq = qdisc_qlen(sch);
@@ -168,31 +123,31 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
out:
q->stats.dropped++;
- return qdisc_drop(skb, sch, to_free);
+ q->vars.accu_prob = 0;
+ return qdisc_drop_reason(skb, sch, to_free, reason);
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
- [TCA_PIE_TARGET] = {.type = NLA_U32},
- [TCA_PIE_LIMIT] = {.type = NLA_U32},
- [TCA_PIE_TUPDATE] = {.type = NLA_U32},
- [TCA_PIE_ALPHA] = {.type = NLA_U32},
- [TCA_PIE_BETA] = {.type = NLA_U32},
- [TCA_PIE_ECN] = {.type = NLA_U32},
- [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_PIE_BETA] = {.type = NLA_U32},
+ [TCA_PIE_ECN] = {.type = NLA_U32},
+ [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
};
static int pie_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct pie_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_PIE_MAX + 1];
- unsigned int qlen, dropped = 0;
int err;
- if (!opt)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy,
+ NULL);
if (err < 0)
return err;
@@ -204,174 +159,213 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
/* convert to pschedtime */
- q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+ WRITE_ONCE(q->params.target,
+ PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC));
}
/* tupdate is in jiffies */
if (tb[TCA_PIE_TUPDATE])
- q->params.tupdate =
- usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
+ WRITE_ONCE(q->params.tupdate,
+ usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])));
if (tb[TCA_PIE_LIMIT]) {
u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
- q->params.limit = limit;
- sch->limit = limit;
+ WRITE_ONCE(q->params.limit, limit);
+ WRITE_ONCE(sch->limit, limit);
}
if (tb[TCA_PIE_ALPHA])
- q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
+ WRITE_ONCE(q->params.alpha, nla_get_u32(tb[TCA_PIE_ALPHA]));
if (tb[TCA_PIE_BETA])
- q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
+ WRITE_ONCE(q->params.beta, nla_get_u32(tb[TCA_PIE_BETA]));
if (tb[TCA_PIE_ECN])
- q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
+ WRITE_ONCE(q->params.ecn, nla_get_u32(tb[TCA_PIE_ECN]));
if (tb[TCA_PIE_BYTEMODE])
- q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+ WRITE_ONCE(q->params.bytemode,
+ nla_get_u32(tb[TCA_PIE_BYTEMODE]));
+
+ if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
+ WRITE_ONCE(q->params.dq_rate_estimator,
+ nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]));
/* Drop excess packets if new limit is lower */
- qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
+
+ if (!skb)
+ break;
- dropped += qdisc_pkt_len(skb);
- qdisc_qstats_backlog_dec(sch, skb);
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_qdisc_drop(skb, sch);
}
- qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
}
-static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
+ struct pie_vars *vars, u32 backlog)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- int qlen = sch->qstats.backlog; /* current queue size in bytes */
+ psched_time_t now = psched_get_time();
+ u32 dtime = 0;
+
+ /* If dq_rate_estimator is disabled, calculate qdelay using the
+ * packet timestamp.
+ */
+ if (!params->dq_rate_estimator) {
+ vars->qdelay = now - pie_get_enqueue_time(skb);
+
+ if (vars->dq_tstamp != DTIME_INVALID)
+ dtime = now - vars->dq_tstamp;
+
+ vars->dq_tstamp = now;
+
+ if (backlog == 0)
+ vars->qdelay = 0;
+
+ if (dtime == 0)
+ return;
+
+ goto burst_allowance_reduction;
+ }
/* If current queue is about 10 packets or more and dq_count is unset
* we have enough packets to calculate the drain rate. Save
* current time as dq_tstamp and start measurement cycle.
*/
- if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
- q->vars.dq_tstamp = psched_get_time();
- q->vars.dq_count = 0;
+ if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
+ vars->dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
}
- /* Calculate the average drain rate from this value. If queue length
- * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
+ /* Calculate the average drain rate from this value. If queue length
+ * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset
* the dq_count to -1 as we don't have enough packets to calculate the
- * drain rate anymore The following if block is entered only when we
+ * drain rate anymore. The following if block is entered only when we
* have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
* and we calculate the drain rate for the threshold here. dq_count is
* in bytes, time difference in psched_time, hence rate is in
* bytes/psched_time.
*/
- if (q->vars.dq_count != DQCOUNT_INVALID) {
- q->vars.dq_count += skb->len;
+ if (vars->dq_count != DQCOUNT_INVALID) {
+ vars->dq_count += skb->len;
- if (q->vars.dq_count >= QUEUE_THRESHOLD) {
- psched_time_t now = psched_get_time();
- u32 dtime = now - q->vars.dq_tstamp;
- u32 count = q->vars.dq_count << PIE_SCALE;
+ if (vars->dq_count >= QUEUE_THRESHOLD) {
+ u32 count = vars->dq_count << PIE_SCALE;
+
+ dtime = now - vars->dq_tstamp;
if (dtime == 0)
return;
count = count / dtime;
- if (q->vars.avg_dq_rate == 0)
- q->vars.avg_dq_rate = count;
+ if (vars->avg_dq_rate == 0)
+ vars->avg_dq_rate = count;
else
- q->vars.avg_dq_rate =
- (q->vars.avg_dq_rate -
- (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+ vars->avg_dq_rate =
+ (vars->avg_dq_rate -
+ (vars->avg_dq_rate >> 3)) + (count >> 3);
/* If the queue has receded below the threshold, we hold
* on to the last drain rate calculated, else we reset
* dq_count to 0 to re-enter the if block when the next
* packet is dequeued
*/
- if (qlen < QUEUE_THRESHOLD) {
- q->vars.dq_count = DQCOUNT_INVALID;
+ if (backlog < QUEUE_THRESHOLD) {
+ vars->dq_count = DQCOUNT_INVALID;
} else {
- q->vars.dq_count = 0;
- q->vars.dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
+ vars->dq_tstamp = psched_get_time();
}
- if (q->vars.burst_time > 0) {
- if (q->vars.burst_time > dtime)
- q->vars.burst_time -= dtime;
- else
- q->vars.burst_time = 0;
- }
+ goto burst_allowance_reduction;
}
}
+
+ return;
+
+burst_allowance_reduction:
+ if (vars->burst_time > 0) {
+ if (vars->burst_time > dtime)
+ vars->burst_time -= dtime;
+ else
+ vars->burst_time = 0;
+ }
}
+EXPORT_SYMBOL_GPL(pie_process_dequeue);
-static void calculate_probability(struct Qdisc *sch)
+void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
+ u32 backlog)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
- psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
- s32 delta = 0; /* determines the change in probability */
- u32 oldprob;
- u32 alpha, beta;
+ psched_time_t qdelay_old = 0; /* in pschedtime */
+ s64 delta = 0; /* determines the change in probability */
+ u64 oldprob;
+ u64 alpha, beta;
+ u32 power;
bool update_prob = true;
- q->vars.qdelay_old = q->vars.qdelay;
+ if (params->dq_rate_estimator) {
+ qdelay_old = vars->qdelay;
+ vars->qdelay_old = vars->qdelay;
- if (q->vars.avg_dq_rate > 0)
- qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
- else
- qdelay = 0;
+ if (vars->avg_dq_rate > 0)
+ qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate;
+ else
+ qdelay = 0;
+ } else {
+ qdelay = vars->qdelay;
+ qdelay_old = vars->qdelay_old;
+ }
- /* If qdelay is zero and qlen is not, it means qlen is very small, less
- * than dequeue_rate, so we do not update probabilty in this round
+ /* If qdelay is zero and backlog is not, it means backlog is very small,
+ * so we do not update probability in this round.
*/
- if (qdelay == 0 && qlen != 0)
+ if (qdelay == 0 && backlog != 0)
update_prob = false;
/* In the algorithm, alpha and beta are between 0 and 2 with typical
* value for alpha as 0.125. In this implementation, we use values 0-32
* passed from user space to represent this. Also, alpha and beta have
	 * unit of HZ and need to be scaled before they can be used to update
- * probability. alpha/beta are updated locally below by 1) scaling them
- * appropriately 2) scaling down by 16 to come to 0-2 range.
- * Please see paper for details.
- *
- * We scale alpha and beta differently depending on whether we are in
- * light, medium or high dropping mode.
+ * probability. alpha/beta are updated locally below by scaling down
+ * by 16 to come to 0-2 range.
*/
- if (q->vars.prob < MAX_PROB / 100) {
- alpha =
- (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
- beta =
- (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
- } else if (q->vars.prob < MAX_PROB / 10) {
- alpha =
- (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
- beta =
- (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
- } else {
- alpha =
- (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
- beta =
- (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+
+ /* We scale alpha and beta differently depending on how heavy the
+ * congestion is. Please see RFC 8033 for details.
+ */
+ if (vars->prob < MAX_PROB / 10) {
+ alpha >>= 1;
+ beta >>= 1;
+
+ power = 100;
+ while (vars->prob < div_u64(MAX_PROB, power) &&
+ power <= 1000000) {
+ alpha >>= 2;
+ beta >>= 2;
+ power *= 10;
+ }
}
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
- delta += alpha * ((qdelay - q->params.target));
- delta += beta * ((qdelay - qdelay_old));
+ delta += alpha * (qdelay - params->target);
+ delta += beta * (qdelay - qdelay_old);
- oldprob = q->vars.prob;
+ oldprob = vars->prob;
/* to ensure we increase probability in steps of no more than 2% */
- if (delta > (s32)(MAX_PROB / (100 / 2)) &&
- q->vars.prob >= MAX_PROB / 10)
+ if (delta > (s64)(MAX_PROB / (100 / 2)) &&
+ vars->prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
/* Non-linear drop:
@@ -382,12 +376,12 @@ static void calculate_probability(struct Qdisc *sch)
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
delta += MAX_PROB / (100 / 2);
- q->vars.prob += delta;
+ vars->prob += delta;
if (delta > 0) {
/* prevent overflow */
- if (q->vars.prob < oldprob) {
- q->vars.prob = MAX_PROB;
+ if (vars->prob < oldprob) {
+ vars->prob = MAX_PROB;
/* Prevent normalization error. If probability is at
* maximum value already, we normalize it here, and
* skip the check to do a non-linear drop in the next
@@ -397,8 +391,8 @@ static void calculate_probability(struct Qdisc *sch)
}
} else {
/* prevent underflow */
- if (q->vars.prob > oldprob)
- q->vars.prob = 0;
+ if (vars->prob > oldprob)
+ vars->prob = 0;
}
/* Non-linear drop in probability: Reduce drop probability quickly if
@@ -406,37 +400,46 @@ static void calculate_probability(struct Qdisc *sch)
*/
if (qdelay == 0 && qdelay_old == 0 && update_prob)
- q->vars.prob = (q->vars.prob * 98) / 100;
+ /* Reduce drop probability to 98.4% of its current value */
+ vars->prob -= vars->prob / 64;
- q->vars.qdelay = qdelay;
- q->vars.qlen_old = qlen;
+ vars->qdelay = qdelay;
+ vars->backlog_old = backlog;
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
* 2. Calculated drop probability is zero
- * 3. We have atleast one estimate for the avg_dq_rate ie.,
- * is a non-zero value
+ * 3. If the dq_rate_estimator is enabled, we have at least one
+ * estimate for the avg_dq_rate, i.e., it is a non-zero value
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.qdelay_old < q->params.target / 2) &&
- q->vars.prob == 0 &&
- q->vars.avg_dq_rate > 0)
- pie_vars_init(&q->vars);
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->qdelay_old < params->target / 2) &&
+ vars->prob == 0 &&
+ (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) {
+ pie_vars_init(vars);
+ }
+
+ if (!params->dq_rate_estimator)
+ vars->qdelay_old = qdelay;
}
+EXPORT_SYMBOL_GPL(pie_calculate_probability);
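A quick sanity check on the decay step above: vars->prob -= vars->prob / 64 keeps 63/64 = 98.4375% of the previous value, which is where the "98.4%" figure comes from, and the divide-by-64 form avoids the multiplication in the old (prob * 98) / 100 expression:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t prob = 1000000;

    prob -= prob / 64;  /* 1000000 - 15625: 98.4375% remains */
    assert(prob == 984375);
    return 0;
}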
static void pie_timer(struct timer_list *t)
{
- struct pie_sched_data *q = from_timer(q, t, adapt_timer);
+ struct pie_sched_data *q = timer_container_of(q, t, adapt_timer);
struct Qdisc *sch = q->sch;
- spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spinlock_t *root_lock;
+ rcu_read_lock();
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
- calculate_probability(sch);
+ pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog);
/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
if (q->params.tupdate)
mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
spin_unlock(root_lock);
+ rcu_read_unlock();
}
static int pie_init(struct Qdisc *sch, struct nlattr *opt,
@@ -467,21 +470,24 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
struct pie_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!opts)
goto nla_put_failure;
/* convert target from pschedtime to us */
if (nla_put_u32(skb, TCA_PIE_TARGET,
- ((u32)PSCHED_TICKS2NS(q->params.target)) /
+ ((u32)PSCHED_TICKS2NS(READ_ONCE(q->params.target))) /
NSEC_PER_USEC) ||
- nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_PIE_LIMIT, READ_ONCE(sch->limit)) ||
nla_put_u32(skb, TCA_PIE_TUPDATE,
- jiffies_to_usecs(q->params.tupdate)) ||
- nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
- nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
+ jiffies_to_usecs(READ_ONCE(q->params.tupdate))) ||
+ nla_put_u32(skb, TCA_PIE_ALPHA, READ_ONCE(q->params.alpha)) ||
+ nla_put_u32(skb, TCA_PIE_BETA, READ_ONCE(q->params.beta)) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
- nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ nla_put_u32(skb, TCA_PIE_BYTEMODE,
+ READ_ONCE(q->params.bytemode)) ||
+ nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
+ READ_ONCE(q->params.dq_rate_estimator)))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -495,12 +501,9 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
- .prob = q->vars.prob,
+ .prob = q->vars.prob << BITS_PER_BYTE,
.delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
- /* unscale and return dq_rate in bytes per sec */
- .avg_dq_rate = q->vars.avg_dq_rate *
- (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
.packets_in = q->stats.packets_in,
.overlimit = q->stats.overlimit,
.maxq = q->stats.maxq,
@@ -508,17 +511,26 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.ecn_mark = q->stats.ecn_mark,
};
+ /* avg_dq_rate is only valid if dq_rate_estimator is enabled */
+ st.dq_rate_estimating = q->params.dq_rate_estimator;
+
+ /* unscale and return dq_rate in bytes per sec */
+ if (q->params.dq_rate_estimator)
+ st.avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
+ struct pie_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
- pie_process_dequeue(sch, skb);
+ pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog);
return skb;
}
@@ -535,11 +547,11 @@ static void pie_destroy(struct Qdisc *sch)
struct pie_sched_data *q = qdisc_priv(sch);
q->params.tupdate = 0;
- del_timer_sync(&q->adapt_timer);
+ timer_delete_sync(&q->adapt_timer);
}
static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
- .id = "pie",
+ .id = "pie",
.priv_size = sizeof(struct pie_sched_data),
.enqueue = pie_qdisc_enqueue,
.dequeue = pie_qdisc_dequeue,
@@ -552,6 +564,7 @@ static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
.dump_stats = pie_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("pie");
static int __init pie_module_init(void)
{
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index 5619d2eb17b6..cefb65201e17 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* sch_plug.c Queue traffic until an explicit release command
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* There are two ways to use this qdisc:
* 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
* sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
@@ -165,9 +161,6 @@ static int plug_change(struct Qdisc *sch, struct nlattr *opt,
struct plug_sched_data *q = qdisc_priv(sch);
struct tc_plug_qopt *msg;
- if (opt == NULL)
- return -EINVAL;
-
msg = nla_data(opt);
if (nla_len(opt) < sizeof(*msg))
return -EINVAL;
@@ -214,12 +207,13 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct plug_sched_data),
.enqueue = plug_enqueue,
.dequeue = plug_dequeue,
- .peek = qdisc_peek_head,
+ .peek = qdisc_peek_dequeued,
.init = plug_init,
.change = plug_change,
.reset = qdisc_reset_queue,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("plug");
static int __init plug_module_init(void)
{
@@ -233,3 +227,4 @@ static void __exit plug_module_exit(void)
module_init(plug_module_init)
module_exit(plug_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Qdisc to plug and unplug traffic via netlink control");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 847141cd900f..9e2b9a490db2 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_prio.c Simple 3-band priority "scheduler".
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
* Init -- EINVAL when opt undefined
@@ -43,14 +39,14 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
if (TC_H_MAJ(skb->priority) != sch->handle) {
fl = rcu_dereference_bh(q->filter_list);
- err = tcf_classify(skb, fl, &res, false);
+ err = tcf_classify(skb, NULL, fl, &res, false);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -139,8 +135,6 @@ prio_reset(struct Qdisc *sch)
for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
}
static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt)
@@ -191,7 +185,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
return -EINVAL;
qopt = nla_data(opt);
- if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+ if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < TCQ_MIN_PRIO_BANDS)
return -EINVAL;
for (i = 0; i <= TC_PRIO_MAX; i++) {
@@ -216,12 +210,8 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i = q->bands; i < oldbands; i++) {
- struct Qdisc *child = q->queues[i];
-
- qdisc_tree_reduce_backlog(child, child->q.qlen,
- child->qstats.backlog);
- }
+ for (i = q->bands; i < oldbands; i++)
+ qdisc_purge_queue(q->queues[i]);
for (i = oldbands; i < q->bands; i++) {
q->queues[i] = queues[i];
@@ -300,8 +290,14 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct tc_prio_qopt_offload graft_offload;
unsigned long band = arg - 1;
- if (new == NULL)
- new = &noop_qdisc;
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, arg), extack);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
*old = qdisc_replace(sch, new, &q->queues[band]);
@@ -363,9 +359,9 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct Qdisc *cl_q;
cl_q = q->queues[cl - 1];
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats,
+ &cl_q->bstats, true) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
return -1;
return 0;
@@ -380,15 +376,8 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (prio = 0; prio < q->bands; prio++) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, prio + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, prio + 1, arg))
break;
- }
- arg->count++;
}
}
@@ -429,6 +418,7 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
.dump = prio_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("prio");
static int __init prio_module_init(void)
{
@@ -444,3 +434,4 @@ module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Simple 3-band priority qdisc");
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 29f5c4a24688..d920f57dc6d7 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
*
* Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
* Copyright (c) 2012 Paolo Valente.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -116,6 +113,7 @@
#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
+#define QFQ_MAX_LMAX (1UL << QFQ_MTU_SHIFT)
#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
@@ -132,9 +130,7 @@ struct qfq_aggregate;
struct qfq_class {
struct Qdisc_class_common common;
- unsigned int filter_cnt;
-
- struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_basic_sync bstats;
struct gnet_stats_queue qstats;
struct net_rate_estimator __rcu *rate_est;
struct Qdisc *qdisc;
@@ -206,6 +202,11 @@ struct qfq_sched {
*/
enum update_reason {enqueue, requeue};
+static bool cl_is_active(struct qfq_class *cl)
+{
+ return !list_empty(&cl->alist);
+}
+
static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
{
struct qfq_sched *q = qdisc_priv(sch);
@@ -217,18 +218,14 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
return container_of(clc, struct qfq_class, common);
}
-static void qfq_purge_queue(struct qfq_class *cl)
-{
- unsigned int len = cl->qdisc->q.qlen;
- unsigned int backlog = cl->qdisc->qstats.backlog;
-
- qdisc_reset(cl->qdisc);
- qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
-}
+static const struct netlink_range_validation lmax_range = {
+ .min = QFQ_MIN_LMAX,
+ .max = QFQ_MAX_LMAX,
+};
static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
- [TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
- [TCA_QFQ_LMAX] = { .type = NLA_U32 },
+ [TCA_QFQ_WEIGHT] = NLA_POLICY_RANGE(NLA_U32, 1, QFQ_MAX_WEIGHT),
+ [TCA_QFQ_LMAX] = NLA_POLICY_FULL_RANGE(NLA_U32, &lmax_range),
};
/*
@@ -355,7 +352,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
struct qfq_aggregate *agg = cl->agg;
- list_del(&cl->alist); /* remove from RR queue of the aggregate */
+ list_del_init(&cl->alist); /* remove from RR queue of the aggregate */
if (list_empty(&agg->active)) /* agg is now inactive */
qfq_deactivate_agg(q, agg);
}
@@ -387,8 +384,13 @@ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
u32 lmax)
{
struct qfq_sched *q = qdisc_priv(sch);
- struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
+ struct qfq_aggregate *new_agg;
+
+ /* 'lmax' can range over [QFQ_MIN_LMAX, pktlen + stab overhead] */
+ if (lmax > QFQ_MAX_LMAX)
+ return -EINVAL;
+ new_agg = qfq_find_agg(q, lmax, weight);
if (new_agg == NULL) { /* create new aggregate */
new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
if (new_agg == NULL)
@@ -410,51 +412,52 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
bool existing = false;
struct nlattr *tb[TCA_QFQ_MAX + 1];
struct qfq_aggregate *new_agg = NULL;
- u32 weight, lmax, inv_w;
+ u32 weight, lmax, inv_w, old_weight, old_lmax;
int err;
int delta_w;
- if (tca[TCA_OPTIONS] == NULL) {
- pr_notice("qfq: no options\n");
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) {
+ NL_SET_ERR_MSG_MOD(extack, "missing options");
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS],
+ qfq_policy, extack);
if (err < 0)
return err;
- if (tb[TCA_QFQ_WEIGHT]) {
- weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
- if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
- pr_notice("qfq: invalid weight %u\n", weight);
- return -EINVAL;
- }
- } else
- weight = 1;
+ weight = nla_get_u32_default(tb[TCA_QFQ_WEIGHT], 1);
if (tb[TCA_QFQ_LMAX]) {
lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
- if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
- pr_notice("qfq: invalid max length %u\n", lmax);
+ } else {
+ /* MTU size is user controlled */
+ lmax = psched_mtu(qdisc_dev(sch));
+ if (lmax < QFQ_MIN_LMAX || lmax > QFQ_MAX_LMAX) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MTU size out of bounds for qfq");
return -EINVAL;
}
- } else
- lmax = psched_mtu(qdisc_dev(sch));
+ }
inv_w = ONE_FP / weight;
weight = ONE_FP / inv_w;
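The pair of divisions above canonicalises the weight: only values representable as ONE_FP / inv_w survive, so two classes asking for the same effective weight land in the same aggregate. A standalone sketch (FRAC_BITS = 30 is an assumption mirroring qfq's fixed-point format):

#include <stdio.h>

#define ONE_FP (1U << 30)

int main(void)
{
    unsigned int weight = 333;
    unsigned int inv_w  = ONE_FP / weight;  /* 3224449 */
    unsigned int canon  = ONE_FP / inv_w;   /* 333 again here */

    printf("weight=%u inv_w=%u canonical=%u\n", weight, inv_w, canon);
    return 0;
}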
- if (cl != NULL &&
- lmax == cl->agg->lmax &&
- weight == cl->agg->class_weight)
- return 0; /* nothing to change */
+ if (cl != NULL) {
+ sch_tree_lock(sch);
+ old_weight = cl->agg->class_weight;
+ old_lmax = cl->agg->lmax;
+ sch_tree_unlock(sch);
+ if (lmax == old_lmax && weight == old_weight)
+ return 0; /* nothing to change */
+ }
- delta_w = weight - (cl ? cl->agg->class_weight : 0);
+ delta_w = weight - (cl ? old_weight : 0);
if (q->wsum + delta_w > QFQ_MAX_WSUM) {
- pr_notice("qfq: total weight out of range (%d + %u)\n",
- delta_w, q->wsum);
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "total weight out of range (%d + %u)",
+ delta_w, q->wsum);
return -EINVAL;
}
@@ -463,7 +466,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
NULL,
- qdisc_root_sleeping_running(sch),
+ true,
tca[TCA_RATE]);
if (err)
return err;
@@ -477,8 +480,10 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl == NULL)
return -ENOBUFS;
+ gnet_stats_basic_sync_init(&cl->bstats);
cl->common.classid = classid;
cl->deficit = lmax;
+ INIT_LIST_HEAD(&cl->alist);
cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
classid, NULL);
@@ -489,7 +494,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
err = gen_new_estimator(&cl->bstats, NULL,
&cl->rate_est,
NULL,
- qdisc_root_sleeping_running(sch),
+ true,
tca[TCA_RATE]);
if (err)
goto destroy_class;
@@ -497,11 +502,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl->qdisc != &noop_qdisc)
qdisc_hash_add(cl->qdisc, true);
- sch_tree_lock(sch);
- qdisc_class_hash_insert(&q->clhash, &cl->common);
- sch_tree_unlock(sch);
-
- qdisc_class_hash_grow(sch, &q->clhash);
set_change_agg:
sch_tree_lock(sch);
@@ -519,8 +519,11 @@ set_change_agg:
}
if (existing)
qfq_deact_rm_from_agg(q, cl);
+ else
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
qfq_add_to_agg(q, new_agg, cl);
sch_tree_unlock(sch);
+ qdisc_class_hash_grow(sch, &q->clhash);
*arg = (unsigned long)cl;
return 0;
@@ -533,26 +536,27 @@ destroy_class:
static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
{
- struct qfq_sched *q = qdisc_priv(sch);
-
- qfq_rm_from_agg(q, cl);
gen_kill_estimator(&cl->rate_est);
qdisc_put(cl->qdisc);
kfree(cl);
}
-static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
+static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
+ struct netlink_ext_ack *extack)
{
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl = (struct qfq_class *)arg;
- if (cl->filter_cnt > 0)
+ if (qdisc_class_in_use(&cl->common)) {
+ NL_SET_ERR_MSG_MOD(extack, "QFQ class in use");
return -EBUSY;
+ }
sch_tree_lock(sch);
- qfq_purge_queue(cl);
+ qdisc_purge_queue(cl->qdisc);
qdisc_class_hash_remove(&q->clhash, &cl->common);
+ qfq_rm_from_agg(q, cl);
sch_tree_unlock(sch);
@@ -581,8 +585,8 @@ static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
{
struct qfq_class *cl = qfq_find_class(sch, classid);
- if (cl != NULL)
- cl->filter_cnt++;
+ if (cl)
+ qdisc_class_get(&cl->common);
return (unsigned long)cl;
}
@@ -591,7 +595,7 @@ static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct qfq_class *cl = (struct qfq_class *)arg;
- cl->filter_cnt--;
+ qdisc_class_put(&cl->common);
}
static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
@@ -623,16 +627,22 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
{
struct qfq_class *cl = (struct qfq_class *)arg;
struct nlattr *nest;
+ u32 class_weight, lmax;
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle = cl->common.classid;
tcm->tcm_info = cl->qdisc->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
- nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
+
+ sch_tree_lock(sch);
+ class_weight = cl->agg->class_weight;
+ lmax = cl->agg->lmax;
+ sch_tree_unlock(sch);
+ if (nla_put_u32(skb, TCA_QFQ_WEIGHT, class_weight) ||
+ nla_put_u32(skb, TCA_QFQ_LMAX, lmax))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -649,14 +659,14 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
memset(&xstats, 0, sizeof(xstats));
+ sch_tree_lock(sch);
xstats.weight = cl->agg->class_weight;
xstats.lmax = cl->agg->lmax;
+ sch_tree_unlock(sch);
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl->bstats) < 0 ||
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL,
- &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
+ qdisc_qstats_copy(d, cl->qdisc) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
@@ -673,15 +683,8 @@ static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
return;
- }
- arg->count++;
}
}
}
@@ -704,7 +707,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
fl = rcu_dereference_bh(q->filter_list);
- result = tcf_classify(skb, fl, &res, false);
+ result = tcf_classify(skb, NULL, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -712,7 +715,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -984,19 +987,24 @@ static void qfq_update_eligible(struct qfq_sched *q)
}
/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
-static void agg_dequeue(struct qfq_aggregate *agg,
- struct qfq_class *cl, unsigned int len)
+static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg,
+ struct qfq_class *cl, unsigned int len)
{
- qdisc_dequeue_peeked(cl->qdisc);
+ struct sk_buff *skb = qdisc_dequeue_peeked(cl->qdisc);
+
+ if (!skb)
+ return NULL;
cl->deficit -= (int) len;
if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
- list_del(&cl->alist);
- else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
+ list_del_init(&cl->alist);
+ else if (cl->deficit < qdisc_peek_len(cl->qdisc)) {
cl->deficit += agg->lmax;
list_move_tail(&cl->alist, &agg->active);
}
+
+ return skb;
}
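The deficit bookkeeping above is classic DRR: a class keeps sending while its deficit covers the head packet, and when it runs short it receives agg->lmax more credit and rotates to the tail of the active list. A toy single-class trace of the same arithmetic:

#include <stdio.h>

int main(void)
{
    int lmax = 1500, deficit = 1500;
    int pkts[] = { 1000, 800, 400 };

    for (int i = 0; i < 3; i++) {
        deficit -= pkts[i];
        printf("sent %d, deficit now %d\n", pkts[i], deficit);
        if (i < 2 && deficit < pkts[i + 1]) {
            deficit += lmax;    /* new round, move to tail */
            printf("replenished to %d\n", deficit);
        }
    }
    return 0;
}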
static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
@@ -1008,7 +1016,7 @@ static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
*cl = list_first_entry(&agg->active, struct qfq_class, alist);
skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
if (skb == NULL)
- WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
+ qdisc_warn_nonwc("qfq_dequeue", (*cl)->qdisc);
else
*len = qdisc_pkt_len(skb);
@@ -1142,11 +1150,18 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
if (!skb)
return NULL;
- qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
+
+ skb = agg_dequeue(in_serv_agg, cl, len);
+
+ if (!skb) {
+ sch->q.qlen++;
+ return NULL;
+ }
+
+ qdisc_qstats_backlog_dec(sch, skb);
qdisc_bstats_update(sch, skb);
- agg_dequeue(in_serv_agg, cl, len);
/* If lmax is lowered, through qfq_change_class, for a class
* owning pending packets with larger size than the new value
* of lmax, then the following condition may hold.
@@ -1215,7 +1230,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct qfq_class *cl;
struct qfq_aggregate *agg;
int err = 0;
- bool first;
cl = qfq_classify(skb, sch, &err);
if (cl == NULL) {
@@ -1236,8 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
}
- gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
- first = !cl->qdisc->q.qlen;
+ gso_segs = qdisc_pkt_segs(skb);
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
pr_debug("qfq_enqueue: enqueue failed %d\n", err);
@@ -1248,14 +1261,13 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return err;
}
- cl->bstats.bytes += len;
- cl->bstats.packets += gso_segs;
+ _bstats_update(&cl->bstats, len, gso_segs);
sch->qstats.backlog += len;
++sch->q.qlen;
agg = cl->agg;
- /* if the queue was not empty, then done here */
- if (!first) {
+ /* if the class is active, then done here */
+ if (cl_is_active(cl)) {
if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
list_first_entry(&agg->active, struct qfq_class, alist)
== cl && cl->deficit < len)
@@ -1417,6 +1429,8 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl = (struct qfq_class *)arg;
+ if (list_empty(&cl->alist))
+ return;
qfq_deactivate_class(q, cl);
}
@@ -1436,10 +1450,8 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
if (err < 0)
return err;
- if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
- max_classes = QFQ_MAX_AGG_CLASSES;
- else
- max_classes = qdisc_dev(sch)->tx_queue_len + 1;
+ max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1,
+ QFQ_MAX_AGG_CLASSES);
/* max_cl_shift = floor(log_2(max_classes)) */
max_cl_shift = __fls(max_classes);
q->max_agg_classes = 1<<max_cl_shift;
@@ -1475,8 +1487,6 @@ static void qfq_reset_qdisc(struct Qdisc *sch)
qdisc_reset(cl->qdisc);
}
}
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
}
static void qfq_destroy_qdisc(struct Qdisc *sch)
@@ -1491,6 +1501,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
common.hnode) {
+ qfq_rm_from_agg(q, cl);
qfq_destroy_class(sch, cl);
}
}
@@ -1524,6 +1535,7 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
.destroy = qfq_destroy_qdisc,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("qfq");
static int __init qfq_init(void)
{
@@ -1538,3 +1550,4 @@ static void __exit qfq_exit(void)
module_init(qfq_init);
module_exit(qfq_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Quick Fair Queueing Plus qdisc");
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 9df9942340ea..479c42d11083 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_red.c Random Early Detection queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
@@ -39,15 +35,23 @@
struct red_sched_data {
u32 limit; /* HARD maximal queue length */
+
unsigned char flags;
+ /* Non-flags in tc_red_qopt.flags. */
+ unsigned char userbits;
+
struct timer_list adapt_timer;
struct Qdisc *sch;
struct red_parms parms;
struct red_vars vars;
struct red_stats stats;
struct Qdisc *qdisc;
+ struct tcf_qevent qe_early_drop;
+ struct tcf_qevent qe_mark;
};
+#define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP)
+
static inline int red_use_ecn(struct red_sched_data *q)
{
return q->flags & TC_RED_ECN;
@@ -58,11 +62,18 @@ static inline int red_use_harddrop(struct red_sched_data *q)
return q->flags & TC_RED_HARDDROP;
}
+static int red_use_nodrop(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_NODROP;
+}
+
static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_CONGESTED;
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
+ unsigned int len;
int ret;
q->vars.qavg = red_calc_qavg(&q->parms,
@@ -78,29 +89,50 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
- if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ if (!red_use_ecn(q)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.prob_mark++;
+ skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret);
+ if (!skb)
+ return NET_XMIT_CN | ret;
+ } else if (!red_use_nodrop(q)) {
q->stats.prob_drop++;
goto congestion_drop;
}
- q->stats.prob_mark++;
+ /* Non-ECT packet in ECN nodrop mode: queue it. */
break;
case RED_HARD_MARK:
+ reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
qdisc_qstats_overlimit(sch);
- if (red_use_harddrop(q) || !red_use_ecn(q) ||
- !INET_ECN_set_ce(skb)) {
+ if (red_use_harddrop(q) || !red_use_ecn(q)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.forced_mark++;
+ skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret);
+ if (!skb)
+ return NET_XMIT_CN | ret;
+ } else if (!red_use_nodrop(q)) {
q->stats.forced_drop++;
goto congestion_drop;
}
- q->stats.forced_mark++;
+ /* Non-ECT packet in ECN nodrop mode: queue it. */
break;
}
+ len = qdisc_pkt_len(skb);
ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -109,7 +141,11 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return ret;
congestion_drop:
- qdisc_drop(skb, sch, to_free);
+ skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, to_free, &ret);
+ if (!skb)
+ return NET_XMIT_CN | ret;
+
+ qdisc_drop_reason(skb, sch, to_free, reason);
return NET_XMIT_CN;
}
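The restructured branches above form one decision table for both marking regions. A hedged summary of the probabilistic case as a standalone predicate (not kernel code; the hard-mark case additionally honours TC_RED_HARDDROP):

#include <assert.h>
#include <stdbool.h>

enum red_verdict { RED_QUEUE, RED_MARK, RED_DROP };

static enum red_verdict red_prob_verdict(bool use_ecn, bool nodrop, bool ect)
{
    if (!use_ecn)
        return RED_DROP;
    if (ect)
        return RED_MARK;                    /* INET_ECN_set_ce() succeeded */
    return nodrop ? RED_QUEUE : RED_DROP;   /* non-ECT packet */
}

int main(void)
{
    assert(red_prob_verdict(true, true, false) == RED_QUEUE);
    assert(red_prob_verdict(true, false, false) == RED_DROP);
    return 0;
}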
@@ -144,8 +180,6 @@ static void red_reset(struct Qdisc *sch)
struct red_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
red_restart(&q->vars);
}
@@ -169,6 +203,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
opt.set.limit = q->limit;
opt.set.is_ecn = red_use_ecn(q);
opt.set.is_harddrop = red_use_harddrop(q);
+ opt.set.is_nodrop = red_use_nodrop(q);
opt.set.qstats = &sch->qstats;
} else {
opt.command = TC_RED_DESTROY;
@@ -181,44 +216,54 @@ static void red_destroy(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
- del_timer_sync(&q->adapt_timer);
+ tcf_qevent_destroy(&q->qe_mark, sch);
+ tcf_qevent_destroy(&q->qe_early_drop, sch);
+ timer_delete_sync(&q->adapt_timer);
red_offload(sch, false);
qdisc_put(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
+ [TCA_RED_UNSPEC] = { .strict_start_type = TCA_RED_FLAGS },
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
[TCA_RED_MAX_P] = { .type = NLA_U32 },
+ [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS),
+ [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 },
+ [TCA_RED_MARK_BLOCK] = { .type = NLA_U32 },
};
-static int red_change(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static int __red_change(struct Qdisc *sch, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
struct Qdisc *old_child = NULL, *child = NULL;
struct red_sched_data *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_RED_MAX + 1];
+ struct nla_bitfield32 flags_bf;
struct tc_red_qopt *ctl;
+ unsigned char userbits;
+ unsigned char flags;
int err;
u32 max_P;
-
- if (opt == NULL)
- return -EINVAL;
-
- err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy, NULL);
- if (err < 0)
- return err;
+ u8 *stab;
if (tb[TCA_RED_PARMS] == NULL ||
tb[TCA_RED_STAB] == NULL)
return -EINVAL;
- max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
+ max_P = nla_get_u32_default(tb[TCA_RED_MAX_P], 0);
ctl = nla_data(tb[TCA_RED_PARMS]);
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+ stab = nla_data(tb[TCA_RED_STAB]);
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Scell_log, stab))
return -EINVAL;
+ err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS,
+ tb[TCA_RED_FLAGS], TC_RED_SUPPORTED_FLAGS,
+ &flags_bf, &userbits, extack);
+ if (err)
+ return err;
+
if (ctl->limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit,
extack);
@@ -230,11 +275,17 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
}
sch_tree_lock(sch);
- q->flags = ctl->flags;
+
+ flags = (q->flags & ~flags_bf.selector) | flags_bf.value;
+ err = red_validate_flags(flags, extack);
+ if (err)
+ goto unlock_out;
+
+ q->flags = flags;
+ q->userbits = userbits;
q->limit = ctl->limit;
if (child) {
- qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
- q->qdisc->qstats.backlog);
+ qdisc_purge_queue(q->qdisc);
old_child = q->qdisc;
q->qdisc = child;
}
@@ -242,11 +293,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
red_set_parms(&q->parms,
ctl->qth_min, ctl->qth_max, ctl->Wlog,
ctl->Plog, ctl->Scell_log,
- nla_data(tb[TCA_RED_STAB]),
+ stab,
max_P);
red_set_vars(&q->vars);
- del_timer(&q->adapt_timer);
+ timer_delete(&q->adapt_timer);
if (ctl->flags & TC_RED_ADAPTATIVE)
mod_timer(&q->adapt_timer, jiffies + HZ/2);
@@ -260,29 +311,86 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
if (old_child)
qdisc_put(old_child);
return 0;
+
+unlock_out:
+ sch_tree_unlock(sch);
+ if (child)
+ qdisc_put(child);
+ return err;
}
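The merge above is the standard nla_bitfield32 idiom: userspace supplies a selector of the bits it wants to change plus their new values, and every bit outside the selector is preserved. A worked example with arbitrary bit values:

#include <assert.h>

int main(void)
{
    unsigned char old_flags = 0x03; /* two historic flag bits set */
    unsigned char selector  = 0x04; /* caller touches only this bit */
    unsigned char value     = 0x04; /* ...and sets it */
    unsigned char flags;

    flags = (old_flags & ~selector) | value;
    assert(flags == 0x07);          /* untouched bits survive */
    return 0;
}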
static inline void red_adaptative_timer(struct timer_list *t)
{
- struct red_sched_data *q = from_timer(q, t, adapt_timer);
+ struct red_sched_data *q = timer_container_of(q, t, adapt_timer);
struct Qdisc *sch = q->sch;
- spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spinlock_t *root_lock;
+ rcu_read_lock();
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
red_adaptative_algo(&q->parms, &q->vars);
mod_timer(&q->adapt_timer, jiffies + HZ/2);
spin_unlock(root_lock);
+ rcu_read_unlock();
}
static int red_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_RED_MAX + 1];
+ int err;
q->qdisc = &noop_qdisc;
q->sch = sch;
timer_setup(&q->adapt_timer, red_adaptative_timer, 0);
- return red_change(sch, opt, extack);
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = __red_change(sch, tb, extack);
+ if (err)
+ return err;
+
+ err = tcf_qevent_init(&q->qe_early_drop, sch,
+ FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP,
+ tb[TCA_RED_EARLY_DROP_BLOCK], extack);
+ if (err)
+ return err;
+
+ return tcf_qevent_init(&q->qe_mark, sch,
+ FLOW_BLOCK_BINDER_TYPE_RED_MARK,
+ tb[TCA_RED_MARK_BLOCK], extack);
+}
+
+static int red_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_RED_MAX + 1];
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = tcf_qevent_validate_change(&q->qe_early_drop,
+ tb[TCA_RED_EARLY_DROP_BLOCK], extack);
+ if (err)
+ return err;
+
+ err = tcf_qevent_validate_change(&q->qe_mark,
+ tb[TCA_RED_MARK_BLOCK], extack);
+ if (err)
+ return err;
+
+ return __red_change(sch, tb, extack);
}
static int red_dump_offload_stats(struct Qdisc *sch)
@@ -306,7 +414,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
.limit = q->limit,
- .flags = q->flags,
+ .flags = (q->flags & TC_RED_HISTORIC_FLAGS) |
+ q->userbits,
.qth_min = q->parms.qth_min >> q->parms.Wlog,
.qth_max = q->parms.qth_max >> q->parms.Wlog,
.Wlog = q->parms.Wlog,
@@ -319,11 +428,15 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
if (err)
goto nla_put_failure;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
- nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
+ nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) ||
+ nla_put_bitfield32(skb, TCA_RED_FLAGS,
+ q->flags, TC_RED_SUPPORTED_FLAGS) ||
+ tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) ||
+ tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -352,7 +465,6 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
}
st.early = q->stats.prob_drop + q->stats.forced_drop;
st.pdrop = q->stats.pdrop;
- st.other = q->stats.other;
st.marked = q->stats.prob_mark + q->stats.forced_mark;
return gnet_stats_copy_app(d, &st, sizeof(st));
@@ -411,12 +523,7 @@ static unsigned long red_find(struct Qdisc *sch, u32 classid)
static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
- if (walker->count >= walker->skip)
- if (walker->fn(sch, 1, walker) < 0) {
- walker->stop = 1;
- return;
- }
- walker->count++;
+ tc_qdisc_stats_dump(sch, 1, walker);
}
}
@@ -443,6 +550,7 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = {
.dump_stats = red_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("red");
static int __init red_module_init(void)
{
@@ -458,3 +566,4 @@ module_init(red_module_init)
module_exit(red_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Random Early Detection qdisc");
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index bab506b01a32..d2835f1168e1 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -1,19 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_sfb.c Stochastic Fair Blue
*
* Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
* W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
* A New Class of Active Queue Management Algorithms.
* U. Michigan CSE-TR-387-99, April 1999.
*
* http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
- *
*/
#include <linux/module.h>
@@ -22,7 +18,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/random.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -49,7 +45,7 @@ struct sfb_bucket {
* (Section 4.4 of SFB reference : moving hash functions)
*/
struct sfb_bins {
- u32 perturbation; /* jhash perturbation */
+ siphash_key_t perturbation; /* siphash key */
struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
};
@@ -139,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
}
}
-static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q)
{
u32 sfbhash;
- sfbhash = sfb_hash(skb, 0);
+ sfbhash = cb->hashes[0];
if (sfbhash)
increment_one_qlen(sfbhash, 0, q);
- sfbhash = sfb_hash(skb, 1);
+ sfbhash = cb->hashes[1];
if (sfbhash)
increment_one_qlen(sfbhash, 1, q);
}
@@ -221,7 +217,8 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da
static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
{
- q->bins[slot].perturbation = prandom_u32();
+ get_random_bytes(&q->bins[slot].perturbation,
+ sizeof(q->bins[slot].perturbation));
}
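The perturbation is now a full 128-bit siphash key instead of a 32-bit jhash seed, so bucket placement can no longer be brute-forced by an off-path attacker observing collisions. A kernel-flavoured sketch of the rekey/hash pair (names and context are illustrative, not sfb's actual helpers):

#include <linux/random.h>
#include <linux/siphash.h>

static siphash_key_t demo_key;

static void demo_rekey(void)
{
    get_random_bytes(&demo_key, sizeof(demo_key));
}

static u32 demo_hash(u32 salt)
{
    /* siphash_1u32() returns u64; sfb truncates to 32 bits too */
    return (u32)siphash_1u32(salt, &demo_key);
}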
static void sfb_swap_slot(struct sfb_sched_data *q)
@@ -260,7 +257,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
struct tcf_result res;
int result;
- result = tcf_classify(skb, fl, &res, false);
+ result = tcf_classify(skb, NULL, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -268,7 +265,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return false;
}
@@ -283,9 +280,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
struct sfb_sched_data *q = qdisc_priv(sch);
+ unsigned int len = qdisc_pkt_len(skb);
struct Qdisc *child = q->qdisc;
struct tcf_proto *fl;
+ struct sfb_skb_cb cb;
int i;
u32 p_min = ~0;
u32 minqlen = ~0;
@@ -318,9 +318,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* If using external classifiers, get result and record it. */
if (!sfb_classify(skb, fl, &ret, &salt))
goto other_drop;
- sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation);
} else {
- sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation);
+ sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation);
}
@@ -356,7 +356,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* Inelastic flow */
if (q->double_buffering) {
sfbhash = skb_get_hash_perturb(skb,
- q->bins[slot].perturbation);
+ &q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
@@ -380,7 +380,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
goto enqueue;
}
- r = prandom_u32() & SFB_MAX_PROB;
+ r = get_random_u16() & SFB_MAX_PROB;
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
if (unlikely(r < p_min)) {
if (unlikely(p_min > SFB_MAX_PROB / 2)) {
@@ -402,11 +403,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
enqueue:
+ memcpy(&cb, sfb_skb_cb(skb), sizeof(cb));
ret = qdisc_enqueue(skb, child, to_free);
if (likely(ret == NET_XMIT_SUCCESS)) {
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
- increment_qlen(skb, q);
+ increment_qlen(&cb, q);
} else if (net_xmit_drop_count(ret)) {
q->stats.childdrop++;
qdisc_qstats_drop(sch);
@@ -414,7 +416,7 @@ enqueue:
return ret;
drop:
- qdisc_drop(skb, sch, to_free);
+ qdisc_drop_reason(skb, sch, to_free, reason);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
@@ -455,9 +457,8 @@ static void sfb_reset(struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
- qdisc_reset(q->qdisc);
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
+ if (likely(q->qdisc))
+ qdisc_reset(q->qdisc);
q->slot = 0;
q->double_buffering = false;
sfb_zero_all_buckets(q);
@@ -492,14 +493,15 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct sfb_sched_data *q = qdisc_priv(sch);
- struct Qdisc *child;
+ struct Qdisc *child, *old;
struct nlattr *tb[TCA_SFB_MAX + 1];
const struct tc_sfb_qopt *ctl = &sfb_default_ops;
u32 limit;
int err;
if (opt) {
- err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_SFB_MAX, opt,
+ sfb_policy, NULL);
if (err < 0)
return -EINVAL;
@@ -521,9 +523,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
qdisc_hash_add(child, true);
sch_tree_lock(sch);
- qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
- q->qdisc->qstats.backlog);
- qdisc_put(q->qdisc);
+ qdisc_purge_queue(q->qdisc);
+ old = q->qdisc;
q->qdisc = child;
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
@@ -546,6 +547,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
sfb_init_perturbation(1, q);
sch_tree_unlock(sch);
+ qdisc_put(old);
return 0;
}
@@ -581,7 +583,7 @@ static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
};
sch->qstats.backlog = q->qdisc->qstats.backlog;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
@@ -651,7 +653,8 @@ static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
return -ENOSYS;
}
-static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+static int sfb_delete(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
{
return -ENOSYS;
}
@@ -659,12 +662,7 @@ static int sfb_delete(struct Qdisc *sch, unsigned long cl)
static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
- if (walker->count >= walker->skip)
- if (walker->fn(sch, 1, walker) < 0) {
- walker->stop = 1;
- return;
- }
- walker->count++;
+ tc_qdisc_stats_dump(sch, 1, walker);
}
}
@@ -713,6 +711,7 @@ static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
.dump_stats = sfb_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("sfb");
static int __init sfb_module_init(void)
{
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 2f2678197760..96eb2f122973 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -18,7 +14,7 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -81,12 +77,6 @@
#define SFQ_EMPTY_SLOT 0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024
-/* We use 16 bits to store allot, and want to handle packets up to 64K
- * Scale allot by 8 (1<<3) so that no overflow occurs.
- */
-#define SFQ_ALLOT_SHIFT 3
-#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
-
/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;
@@ -108,7 +98,7 @@ struct sfq_slot {
sfq_index next; /* next slot in sfq RR chain */
struct sfq_head dep; /* anchor in dep[] chains */
unsigned short hash; /* hash value (index in ht[]) */
- short allot; /* credit for this slot */
+ int allot; /* credit for this slot */
unsigned int backlog;
struct red_vars vars;
@@ -121,10 +111,9 @@ struct sfq_sched_data {
u8 headdrop;
u8 maxdepth; /* limit of packets per flow */
- u32 perturbation;
+ siphash_key_t perturbation;
u8 cur_depth; /* depth of longest slot */
u8 flags;
- unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
struct tcf_proto __rcu *filter_list;
struct tcf_block *block;
sfq_index *ht; /* Hash table ('divisor' slots) */
@@ -161,7 +150,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
static unsigned int sfq_hash(const struct sfq_sched_data *q,
const struct sk_buff *skb)
{
- return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
+ return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -182,7 +171,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
return sfq_hash(q, skb) + 1;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tcf_classify(skb, fl, &res, false);
+ result = tcf_classify(skb, NULL, fl, &res, false);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -190,7 +179,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return 0;
}
@@ -321,7 +310,10 @@ drop:
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
x = q->tail->next;
slot = &q->slots[x];
- q->tail->next = slot->next;
+ if (slot->next == x)
+ q->tail = NULL; /* no more active slots */
+ else
+ q->tail->next = slot->next;
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
goto drop;
}
@@ -353,7 +345,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
unsigned int hash, dropped;
sfq_index x, qlen;
struct sfq_slot *slot;
- int uninitialized_var(ret);
+ int ret;
struct sk_buff *head;
int delta;
@@ -460,7 +452,7 @@ enqueue:
*/
q->tail = slot;
/* We could use a bigger initial quantum for new flows */
- slot->allot = q->scaled_quantum;
+ slot->allot = q->quantum;
}
if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
@@ -497,7 +489,7 @@ next_slot:
slot = &q->slots[a];
if (slot->allot <= 0) {
q->tail = slot;
- slot->allot += q->scaled_quantum;
+ slot->allot += q->quantum;
goto next_slot;
}
skb = slot_dequeue_head(slot);
@@ -516,7 +508,7 @@ next_slot:
}
q->tail->next = next_a;
} else {
- slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
+ slot->allot -= qdisc_pkt_len(skb);
}
return skb;
}
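Context for the allot changes in this file: the old 16-bit allot forced byte counts to be pre-scaled by 1 << SFQ_ALLOT_SHIFT so that 64K packets would not overflow a short; with allot widened to int, credits become plain bytes. A quick check of both schemes:

#include <assert.h>

int main(void)
{
    /* Old scheme: DIV_ROUND_UP by 8 keeps a 64K packet within a
     * 16-bit allot: 65536 >> 3 = 8192.
     */
    int old_scaled = (65536 + 7) >> 3;
    assert(old_scaled == 8192);

    /* New scheme: int allot, exact byte accounting, no scaling. */
    int allot = 1514;           /* quantum = MTU-sized */
    allot -= 1514;              /* one full-size packet */
    assert(allot == 0);
    return 0;
}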
@@ -599,7 +591,7 @@ drop:
q->tail->next = x;
}
q->tail = slot;
- slot->allot = q->scaled_quantum;
+ slot->allot = q->quantum;
}
}
sch->q.qlen -= dropped;
@@ -608,21 +600,32 @@ drop:
static void sfq_perturbation(struct timer_list *t)
{
- struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
+ struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer);
struct Qdisc *sch = q->sch;
- spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spinlock_t *root_lock;
+ siphash_key_t nkey;
+ int period;
+ get_random_bytes(&nkey, sizeof(nkey));
+ rcu_read_lock();
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
- q->perturbation = prandom_u32();
+ q->perturbation = nkey;
if (!q->filter_list && q->tail)
sfq_rehash(sch);
spin_unlock(root_lock);
- if (q->perturb_period)
- mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+ /* q->perturb_period can change under us from
+ * sfq_change() and sfq_destroy().
+ */
+ period = READ_ONCE(q->perturb_period);
+ if (period)
+ mod_timer(&q->perturb_timer, jiffies + period);
+ rcu_read_unlock();
}
-static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
+static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct tc_sfq_qopt *ctl = nla_data(opt);
@@ -631,6 +634,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
struct red_parms *p = NULL;
struct sk_buff *to_free = NULL;
struct sk_buff *tail = NULL;
+ unsigned int maxflows;
+ unsigned int quantum;
+ unsigned int divisor;
+ int perturb_period;
+ u8 headdrop;
+ u8 maxdepth;
+ int limit;
+ u8 flags;
+
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
@@ -639,45 +651,83 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
if (ctl->divisor &&
(!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
return -EINVAL;
+
+ if ((int)ctl->quantum < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
+ return -EINVAL;
+ }
+
+ if (ctl->perturb_period < 0 ||
+ ctl->perturb_period > INT_MAX / HZ) {
+ NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
+ return -EINVAL;
+ }
+ perturb_period = ctl->perturb_period * HZ;
+
if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
- ctl_v1->Wlog))
+ ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
return -EINVAL;
if (ctl_v1 && ctl_v1->qth_min) {
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
}
+
sch_tree_lock(sch);
- if (ctl->quantum) {
- q->quantum = ctl->quantum;
- q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
- }
- q->perturb_period = ctl->perturb_period * HZ;
+
+ limit = q->limit;
+ divisor = q->divisor;
+ headdrop = q->headdrop;
+ maxdepth = q->maxdepth;
+ maxflows = q->maxflows;
+ quantum = q->quantum;
+ flags = q->flags;
+
+ /* update and validate configuration */
+ if (ctl->quantum)
+ quantum = ctl->quantum;
if (ctl->flows)
- q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
if (ctl->divisor) {
- q->divisor = ctl->divisor;
- q->maxflows = min_t(u32, q->maxflows, q->divisor);
+ divisor = ctl->divisor;
+ maxflows = min_t(u32, maxflows, divisor);
}
if (ctl_v1) {
if (ctl_v1->depth)
- q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
if (p) {
- swap(q->red_parms, p);
- red_set_parms(q->red_parms,
+ red_set_parms(p,
ctl_v1->qth_min, ctl_v1->qth_max,
ctl_v1->Wlog,
ctl_v1->Plog, ctl_v1->Scell_log,
NULL,
ctl_v1->max_P);
}
- q->flags = ctl_v1->flags;
- q->headdrop = ctl_v1->headdrop;
+ flags = ctl_v1->flags;
+ headdrop = ctl_v1->headdrop;
}
if (ctl->limit) {
- q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
- q->maxflows = min_t(u32, q->maxflows, q->limit);
+ limit = min_t(u32, ctl->limit, maxdepth * maxflows);
+ maxflows = min_t(u32, maxflows, limit);
}
+ if (limit == 1) {
+ sch_tree_unlock(sch);
+ kfree(p);
+ NL_SET_ERR_MSG_MOD(extack, "invalid limit");
+ return -EINVAL;
+ }
+
+ /* commit configuration */
+ q->limit = limit;
+ q->divisor = divisor;
+ q->headdrop = headdrop;
+ q->maxdepth = maxdepth;
+ q->maxflows = maxflows;
+ WRITE_ONCE(q->perturb_period, perturb_period);
+ q->quantum = quantum;
+ q->flags = flags;
+ if (p)
+ swap(q->red_parms, p);
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit) {
@@ -689,10 +739,10 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
rtnl_kfree_skbs(to_free, tail);
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
- del_timer(&q->perturb_timer);
+ timer_delete(&q->perturb_timer);
if (q->perturb_period) {
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
}
sch_tree_unlock(sch);
kfree(p);
@@ -714,8 +764,8 @@ static void sfq_destroy(struct Qdisc *sch)
struct sfq_sched_data *q = qdisc_priv(sch);
tcf_block_put(q->block);
- q->perturb_period = 0;
- del_timer_sync(&q->perturb_timer);
+ WRITE_ONCE(q->perturb_period, 0);
+ timer_delete_sync(&q->perturb_timer);
sfq_free(q->ht);
sfq_free(q->slots);
kfree(q->red_parms);
@@ -747,12 +797,11 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
q->maxflows = SFQ_DEFAULT_FLOWS;
q->quantum = psched_mtu(qdisc_dev(sch));
- q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = 0;
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
if (opt) {
- int err = sfq_change(sch, opt);
+ int err = sfq_change(sch, opt, extack);
if (err)
return err;
}
@@ -828,8 +877,6 @@ static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
- /* we cannot bypass queue discipline anymore */
- sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -865,7 +912,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
if (idx != SFQ_EMPTY_SLOT) {
const struct sfq_slot *slot = &q->slots[idx];
- xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
+ xstats.allot = slot->allot;
qs.qlen = slot->qlen;
qs.backlog = slot->backlog;
}
@@ -883,16 +930,12 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (i = 0; i < q->divisor; i++) {
- if (q->ht[i] == SFQ_EMPTY_SLOT ||
- arg->count < arg->skip) {
+ if (q->ht[i] == SFQ_EMPTY_SLOT) {
arg->count++;
continue;
}
- if (arg->fn(sch, i + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, i + 1, arg))
break;
- }
- arg->count++;
}
}
@@ -921,6 +964,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
.dump = sfq_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("sfq");
static int __init sfq_module_init(void)
{
@@ -933,3 +977,4 @@ static void __exit sfq_module_exit(void)
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Stochastic Fairness qdisc");
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
index 52c0b6d8f1d7..f485f62ab721 100644
--- a/net/sched/sch_skbprio.c
+++ b/net/sched/sch_skbprio.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_skbprio.c SKB Priority Queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Nishanth Devarajan, <ndev2021@gmail.com>
* Cody Doucette, <doucette@bu.edu>
* original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
@@ -83,7 +79,9 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
prio = min(skb->priority, max_priority);
qdisc = &q->qdiscs[prio];
- if (sch->q.qlen < sch->limit) {
+
+ /* sch->limit can change under us from skbprio_change() */
+ if (sch->q.qlen < READ_ONCE(sch->limit)) {
__skb_queue_tail(qdisc, skb);
qdisc_qstats_backlog_inc(sch, skb);
q->qstats[prio].backlog += qdisc_pkt_len(skb);
@@ -125,8 +123,6 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* Check to update highest and lowest priorities. */
if (skb_queue_empty(lp_qdisc)) {
if (q->lowest_prio == q->highest_prio) {
- /* The incoming packet is the only packet in queue. */
- BUG_ON(sch->q.qlen != 1);
q->lowest_prio = prio;
q->highest_prio = prio;
} else {
@@ -158,7 +154,6 @@ static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
/* Update highest priority field. */
if (skb_queue_empty(hpq)) {
if (q->lowest_prio == q->highest_prio) {
- BUG_ON(sch->q.qlen);
q->highest_prio = 0;
q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
} else {
@@ -173,7 +168,10 @@ static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
{
struct tc_skbprio_qopt *ctl = nla_data(opt);
- sch->limit = ctl->limit;
+ if (opt->nla_len != nla_attr_size(sizeof(*ctl)))
+ return -EINVAL;
+
+ WRITE_ONCE(sch->limit, ctl->limit);
return 0;
}
@@ -201,7 +199,7 @@ static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tc_skbprio_qopt opt;
- opt.limit = sch->limit;
+ opt.limit = READ_ONCE(sch->limit);
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
return -1;
@@ -214,9 +212,6 @@ static void skbprio_reset(struct Qdisc *sch)
struct skbprio_sched_data *q = qdisc_priv(sch);
int prio;
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
-
for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
__skb_queue_purge(&q->qdiscs[prio]);
@@ -269,15 +264,8 @@ static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (arg->fn(sch, i + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, i + 1, arg))
break;
- }
- arg->count++;
}
}
@@ -303,6 +291,7 @@ static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
.destroy = skbprio_destroy,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("skbprio");
static int __init skbprio_module_init(void)
{
@@ -318,3 +307,4 @@ module_init(skbprio_module_init)
module_exit(skbprio_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SKB priority based scheduling qdisc");
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 206e4dbed12f..300d577b3286 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -6,6 +6,8 @@
*
*/
+#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -13,48 +15,615 @@
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <linux/math64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/time.h>
+#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#define TAPRIO_STAT_NOT_SET (~0ULL)
+
+#include "sch_mqprio_lib.h"
+
+static LIST_HEAD(taprio_list);
+static struct static_key_false taprio_have_broken_mqprio;
+static struct static_key_false taprio_have_working_mqprio;
#define TAPRIO_ALL_GATES_OPEN -1
-struct sched_entry {
- struct list_head list;
+#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
+#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
+#define TAPRIO_SUPPORTED_FLAGS \
+ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
+#define TAPRIO_FLAGS_INVALID U32_MAX
+/* Minimum value for picos_per_byte to ensure non-zero duration
+ * for minimum-sized Ethernet frames (ETH_ZLEN = 60).
+ * 60 * 17 > PSEC_PER_NSEC (1000)
+ */
+#define TAPRIO_PICOS_PER_BYTE_MIN 17
- /* The instant that this entry "closes" and the next one
- * should open, the qdisc will make some effort so that no
- * packet leaves after this time.
+struct sched_entry {
+ /* Durations between this GCL entry and the GCL entry where the
+ * respective traffic class gate closes
*/
- ktime_t close_time;
- atomic_t budget;
+ u64 gate_duration[TC_MAX_QUEUE];
+ atomic_t budget[TC_MAX_QUEUE];
+ /* The qdisc makes some effort so that no packet leaves
+ * after this time
+ */
+ ktime_t gate_close_time[TC_MAX_QUEUE];
+ struct list_head list;
+ /* Used to calculate when to advance the schedule */
+ ktime_t end_time;
+ ktime_t next_txtime;
int index;
u32 gate_mask;
u32 interval;
u8 command;
};
+struct sched_gate_list {
+ /* Longest non-zero contiguous gate durations per traffic class,
+ * or 0 if a traffic class gate never opens during the schedule.
+ */
+ u64 max_open_gate_duration[TC_MAX_QUEUE];
+ u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
+ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
+ struct rcu_head rcu;
+ struct list_head entries;
+ size_t num_entries;
+ ktime_t cycle_end_time;
+ s64 cycle_time;
+ s64 cycle_time_extension;
+ s64 base_time;
+};
+
struct taprio_sched {
struct Qdisc **qdiscs;
struct Qdisc *root;
- s64 base_time;
+ u32 flags;
+ enum tk_offsets tk_offset;
int clockid;
- int picos_per_byte; /* Using picoseconds because for 10Gbps+
- * speeds it's sub-nanoseconds per byte
- */
- size_t num_entries;
+ bool offloaded;
+ bool detected_mqprio;
+ bool broken_mqprio;
+ atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
+ * speeds it's sub-nanoseconds per byte
+ */
/* Protects the update side of the RCU protected current_entry */
spinlock_t current_entry_lock;
struct sched_entry __rcu *current_entry;
- struct list_head entries;
- ktime_t (*get_time)(void);
+ struct sched_gate_list __rcu *oper_sched;
+ struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
+ struct list_head taprio_list;
+ int cur_txq[TC_MAX_QUEUE];
+ u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
+ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */
+ u32 txtime_delay;
+};
+
+struct __tc_taprio_qopt_offload {
+ refcount_t users;
+ struct tc_taprio_qopt_offload offload;
};
+static void taprio_calculate_gate_durations(struct taprio_sched *q,
+ struct sched_gate_list *sched)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sched_entry *entry, *cur;
+ int tc;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ u32 gates_still_open = entry->gate_mask;
+
+ /* For each traffic class, calculate each open gate duration,
+ * starting at this schedule entry and ending at the schedule
+ * entry containing a gate close event for that TC.
+ */
+ cur = entry;
+
+ do {
+ if (!gates_still_open)
+ break;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (!(gates_still_open & BIT(tc)))
+ continue;
+
+ if (cur->gate_mask & BIT(tc))
+ entry->gate_duration[tc] += cur->interval;
+ else
+ gates_still_open &= ~BIT(tc);
+ }
+
+ cur = list_next_entry_circular(cur, &sched->entries, list);
+ } while (cur != entry);
+
+ /* Keep track of the maximum gate duration for each traffic
+ * class, taking care to not confuse a traffic class which is
+ * temporarily closed with one that is always closed.
+ */
+ for (tc = 0; tc < num_tc; tc++)
+ if (entry->gate_duration[tc] &&
+ sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
+ sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
+ }
+}
+
+static bool taprio_entry_allows_tx(ktime_t skb_end_time,
+ struct sched_entry *entry, int tc)
+{
+ return ktime_before(skb_end_time, entry->gate_close_time[tc]);
+}
+
+static ktime_t sched_base_time(const struct sched_gate_list *sched)
+{
+ if (!sched)
+ return KTIME_MAX;
+
+ return ns_to_ktime(sched->base_time);
+}
+
+static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono)
+{
+ /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */
+ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset);
+
+ switch (tk_offset) {
+ case TK_OFFS_MAX:
+ return mono;
+ default:
+ return ktime_mono_to_any(mono, tk_offset);
+ }
+}
+
+static ktime_t taprio_get_time(const struct taprio_sched *q)
+{
+ return taprio_mono_to_any(q, ktime_get());
+}
+
+static void taprio_free_sched_cb(struct rcu_head *head)
+{
+ struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
+ struct sched_entry *entry, *n;
+
+ list_for_each_entry_safe(entry, n, &sched->entries, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+
+ kfree(sched);
+}
+
+static void switch_schedules(struct taprio_sched *q,
+ struct sched_gate_list **admin,
+ struct sched_gate_list **oper)
+{
+ rcu_assign_pointer(q->oper_sched, *admin);
+ rcu_assign_pointer(q->admin_sched, NULL);
+
+ if (*oper)
+ call_rcu(&(*oper)->rcu, taprio_free_sched_cb);
+
+ *oper = *admin;
+ *admin = NULL;
+}
+
+/* Get how much time has been already elapsed in the current cycle. */
+static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
+{
+ ktime_t time_since_sched_start;
+ s32 time_elapsed;
+
+ time_since_sched_start = ktime_sub(time, sched->base_time);
+ div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);
+
+ return time_elapsed;
+}
+
+static ktime_t get_interval_end_time(struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ struct sched_entry *entry,
+ ktime_t intv_start)
+{
+ s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
+ ktime_t intv_end, cycle_ext_end, cycle_end;
+
+ cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
+ intv_end = ktime_add_ns(intv_start, entry->interval);
+ cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);
+
+ if (ktime_before(intv_end, cycle_end))
+ return intv_end;
+ else if (admin && admin != sched &&
+ ktime_after(admin->base_time, cycle_end) &&
+ ktime_before(admin->base_time, cycle_ext_end))
+ return admin->base_time;
+ else
+ return cycle_end;
+}
+
+static int length_to_duration(struct taprio_sched *q, int len)
+{
+ return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
+}
+
+static int duration_to_length(struct taprio_sched *q, u64 duration)
+{
+ return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte));
+}
+
+/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the
+ * q->max_sdu[] requested by the user and the max_sdu dynamically determined by
+ * the maximum open gate durations at the given link speed.
+ */
+static void taprio_update_queue_max_sdu(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct qdisc_size_table *stab)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ u32 max_sdu_from_user;
+ u32 max_sdu_dynamic;
+ u32 max_sdu;
+ int tc;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX;
+
+ /* TC gate never closes => keep the queueMaxSDU
+ * selected by the user
+ */
+ if (sched->max_open_gate_duration[tc] == sched->cycle_time) {
+ max_sdu_dynamic = U32_MAX;
+ } else {
+ u32 max_frm_len;
+
+ max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]);
+ /* Compensate for L1 overhead from size table,
+ * but don't let the frame size go negative
+ */
+ if (stab) {
+ max_frm_len -= stab->szopts.overhead;
+ max_frm_len = max_t(int, max_frm_len,
+ dev->hard_header_len + 1);
+ }
+ max_sdu_dynamic = max_frm_len - dev->hard_header_len;
+ if (max_sdu_dynamic > dev->max_mtu)
+ max_sdu_dynamic = U32_MAX;
+ }
+
+ max_sdu = min(max_sdu_dynamic, max_sdu_from_user);
+
+ if (max_sdu != U32_MAX) {
+ sched->max_frm_len[tc] = max_sdu + dev->hard_header_len;
+ sched->max_sdu[tc] = max_sdu;
+ } else {
+ sched->max_frm_len[tc] = U32_MAX; /* never oversized */
+ sched->max_sdu[tc] = 0;
+ }
+ }
+}
+
+/* Returns the entry corresponding to next available interval. If
+ * validate_interval is set, it only validates whether the timestamp occurs
+ * when the gate corresponding to the skb's traffic class is open.
+ */
+static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
+ struct Qdisc *sch,
+ struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ ktime_t time,
+ ktime_t *interval_start,
+ ktime_t *interval_end,
+ bool validate_interval)
+{
+ ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
+ ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
+ struct sched_entry *entry = NULL, *entry_found = NULL;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool entry_available = false;
+ s32 cycle_elapsed;
+ int tc, n;
+
+ tc = netdev_get_prio_tc_map(dev, skb->priority);
+ packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));
+
+ *interval_start = 0;
+ *interval_end = 0;
+
+ if (!sched)
+ return NULL;
+
+ cycle = sched->cycle_time;
+ cycle_elapsed = get_cycle_time_elapsed(sched, time);
+ curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
+ cycle_end = ktime_add_ns(curr_intv_end, cycle);
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ curr_intv_start = curr_intv_end;
+ curr_intv_end = get_interval_end_time(sched, admin, entry,
+ curr_intv_start);
+
+ if (ktime_after(curr_intv_start, cycle_end))
+ break;
+
+ if (!(entry->gate_mask & BIT(tc)) ||
+ packet_transmit_time > entry->interval)
+ continue;
+
+ txtime = entry->next_txtime;
+
+ if (ktime_before(txtime, time) || validate_interval) {
+ transmit_end_time = ktime_add_ns(time, packet_transmit_time);
+ if ((ktime_before(curr_intv_start, time) &&
+ ktime_before(transmit_end_time, curr_intv_end)) ||
+ (ktime_after(curr_intv_start, time) && !validate_interval)) {
+ entry_found = entry;
+ *interval_start = curr_intv_start;
+ *interval_end = curr_intv_end;
+ break;
+ } else if (!entry_available && !validate_interval) {
+ /* Here, we are just trying to find out the
+ * first available interval in the next cycle.
+ */
+ entry_available = true;
+ entry_found = entry;
+ *interval_start = ktime_add_ns(curr_intv_start, cycle);
+ *interval_end = ktime_add_ns(curr_intv_end, cycle);
+ }
+ } else if (ktime_before(txtime, earliest_txtime) &&
+ !entry_available) {
+ earliest_txtime = txtime;
+ entry_found = entry;
+ n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
+ *interval_start = ktime_add(curr_intv_start, n * cycle);
+ *interval_end = ktime_add(curr_intv_end, n * cycle);
+ }
+ }
+
+ return entry_found;
+}
+
+static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t interval_start, interval_end;
+ struct sched_entry *entry;
+
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ admin = rcu_dereference(q->admin_sched);
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
+ &interval_start, &interval_end, true);
+ rcu_read_unlock();
+
+ return entry;
+}
+
+/* This returns the tstamp value set by TCP in terms of the set clock. */
+static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
+{
+ unsigned int offset = skb_network_offset(skb);
+ const struct ipv6hdr *ipv6h;
+ const struct iphdr *iph;
+ struct ipv6hdr _ipv6h;
+
+ ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
+ if (!ipv6h)
+ return 0;
+
+ if (ipv6h->version == 4) {
+ iph = (struct iphdr *)ipv6h;
+ offset += iph->ihl * 4;
+
+ /* special-case 6in4 tunnelling, as that is a common way to get
+ * v6 connectivity in the home
+ */
+ if (iph->protocol == IPPROTO_IPV6) {
+ ipv6h = skb_header_pointer(skb, offset,
+ sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
+ return 0;
+ } else if (iph->protocol != IPPROTO_TCP) {
+ return 0;
+ }
+ } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) {
+ return 0;
+ }
+
+ return taprio_mono_to_any(q, skb->skb_mstamp_ns);
+}
+
+/* There are a few scenarios where we will have to modify the txtime from
+ * what is read from next_txtime in sched_entry. They are:
+ * 1. If txtime is in the past,
+ * a. The gate for the traffic class is currently open and packet can be
+ * transmitted before it closes, schedule the packet right away.
+ * b. If the gate corresponding to the traffic class is going to open later
+ * in the cycle, set the txtime of packet to the interval start.
+ * 2. If txtime is in the future, there are packets corresponding to the
+ * current traffic class waiting to be transmitted. So, the following
+ * possibilities exist:
+ * a. We can transmit the packet before the window containing the txtime
+ * closes.
+ * b. The window might close before the transmission can be completed
+ * successfully. So, schedule the packet in the next open window.
+ */
+static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
+{
+ ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t minimum_time, now, txtime;
+ int len, packet_transmit_time;
+ struct sched_entry *entry;
+ bool sched_changed;
+
+ now = taprio_get_time(q);
+ minimum_time = ktime_add_ns(now, q->txtime_delay);
+
+ tcp_tstamp = get_tcp_tstamp(q, skb);
+ minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);
+
+ rcu_read_lock();
+ admin = rcu_dereference(q->admin_sched);
+ sched = rcu_dereference(q->oper_sched);
+ if (admin && ktime_after(minimum_time, admin->base_time))
+ switch_schedules(q, &admin, &sched);
+
+ /* Until the schedule starts, all the queues are open */
+ if (!sched || ktime_before(minimum_time, sched->base_time)) {
+ txtime = minimum_time;
+ goto done;
+ }
+
+ len = qdisc_pkt_len(skb);
+ packet_transmit_time = length_to_duration(q, len);
+
+ do {
+ sched_changed = false;
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin,
+ minimum_time,
+ &interval_start, &interval_end,
+ false);
+ if (!entry) {
+ txtime = 0;
+ goto done;
+ }
+
+ txtime = entry->next_txtime;
+ txtime = max_t(ktime_t, txtime, minimum_time);
+ txtime = max_t(ktime_t, txtime, interval_start);
+
+ if (admin && admin != sched &&
+ ktime_after(txtime, admin->base_time)) {
+ sched = admin;
+ sched_changed = true;
+ continue;
+ }
+
+ transmit_end_time = ktime_add(txtime, packet_transmit_time);
+ minimum_time = transmit_end_time;
+
+		/* Update the txtime of current entry to the next time its
+ * interval starts.
+ */
+ if (ktime_after(transmit_end_time, interval_end))
+ entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
+ } while (sched_changed || ktime_after(transmit_end_time, interval_end));
+
+ entry->next_txtime = transmit_end_time;
+
+done:
+ rcu_read_unlock();
+ return txtime;
+}
+
+/* Devices with full offload are expected to honor this in hardware */
+static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
+ struct sk_buff *skb)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sched_gate_list *sched;
+ int prio = skb->priority;
+ bool exceeds = false;
+ u8 tc;
+
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ if (sched && skb->len > sched->max_frm_len[tc])
+ exceeds = true;
+ rcu_read_unlock();
+
+ return exceeds;
+}
+
+static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child, struct sk_buff **to_free)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
+ /* sk_flags are only safe to use on full sockets. */
+ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
+ if (!is_valid_interval(skb, sch))
+ return qdisc_drop(skb, sch, to_free);
+ } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ skb->tstamp = get_packet_txtime(skb, sch);
+ if (!skb->tstamp)
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return qdisc_enqueue(skb, child, to_free);
+}
+
+static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child,
+ struct sk_buff **to_free)
+{
+ unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
+ netdev_features_t features = netif_skb_features(skb);
+ struct sk_buff *segs, *nskb;
+ int ret;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ qdisc_skb_cb(segs)->pkt_segs = 1;
+ slen += segs->len;
+
+ /* FIXME: we should be segmenting to a smaller size
+ * rather than dropping these
+ */
+ if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
+ ret = qdisc_drop(segs, sch, to_free);
+ else
+ ret = taprio_enqueue_one(segs, sch, child, to_free);
+
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ } else {
+ numsegs++;
+ }
+ }
+
+ if (numsegs > 1)
+ qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
+ consume_skb(skb);
+
+ return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
+/* Will not be called in the full offload case, since the TX queues are
+ * attached to the Qdisc created using qdisc_create_dflt()
+ */
static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -68,178 +637,356 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(!child))
return qdisc_drop(skb, sch, to_free);
- qdisc_qstats_backlog_inc(sch, skb);
- sch->q.qlen++;
+ if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) {
+ /* Large packets might not be transmitted when the transmission
+ * duration exceeds any configured interval. Therefore, segment
+ * the skb into smaller chunks. Drivers with full offload are
+ * expected to handle this in hardware.
+ */
+ if (skb_is_gso(skb))
+ return taprio_enqueue_segmented(skb, sch, child,
+ to_free);
- return qdisc_enqueue(skb, child, to_free);
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ return taprio_enqueue_one(skb, sch, child, to_free);
}
static struct sk_buff *taprio_peek(struct Qdisc *sch)
{
+ WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented");
+ return NULL;
+}
+
+static void taprio_set_budgets(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct sched_entry *entry)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ int tc, budget;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ /* Traffic classes which never close have infinite budget */
+ if (entry->gate_duration[tc] == sched->cycle_time)
+ budget = INT_MAX;
+ else
+ budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
+ atomic64_read(&q->picos_per_byte));
+
+ atomic_set(&entry->budget[tc], budget);
+ }
+}
+
+/* When an skb is sent, it consumes from the budget of all traffic classes */
+static int taprio_update_budgets(struct sched_entry *entry, size_t len,
+ int tc_consumed, int num_tc)
+{
+ int tc, budget, new_budget = 0;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ budget = atomic_read(&entry->budget[tc]);
+ /* Don't consume from infinite budget */
+ if (budget == INT_MAX) {
+ if (tc == tc_consumed)
+ new_budget = budget;
+ continue;
+ }
+
+ if (tc == tc_consumed)
+ new_budget = atomic_sub_return(len, &entry->budget[tc]);
+ else
+ atomic_sub(len, &entry->budget[tc]);
+ }
+
+ return new_budget;
+}
+
+static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
+ struct sched_entry *entry,
+ u32 gate_mask)
+{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- struct sched_entry *entry;
+ struct Qdisc *child = q->qdiscs[txq];
+ int num_tc = netdev_get_num_tc(dev);
struct sk_buff *skb;
- u32 gate_mask;
- int i;
+ ktime_t guard;
+ int prio;
+ int len;
+ u8 tc;
- rcu_read_lock();
- entry = rcu_dereference(q->current_entry);
- gate_mask = entry ? entry->gate_mask : -1;
- rcu_read_unlock();
+ if (unlikely(!child))
+ return NULL;
- if (!gate_mask)
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+ goto skip_peek_checks;
+
+ skb = child->ops->peek(child);
+ if (!skb)
return NULL;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- int prio;
- u8 tc;
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
- if (unlikely(!child))
- continue;
+ if (!(gate_mask & BIT(tc)))
+ return NULL;
- skb = child->ops->peek(child);
- if (!skb)
- continue;
+ len = qdisc_pkt_len(skb);
+ guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));
+
+ /* In the case that there's no gate entry, there's no
+ * guard band ...
+ */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ !taprio_entry_allows_tx(guard, entry, tc))
+ return NULL;
+
+ /* ... and no budget. */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ taprio_update_budgets(entry, len, tc, num_tc) < 0)
+ return NULL;
+
+skip_peek_checks:
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ return NULL;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
+{
+ int offset = dev->tc_to_txq[tc].offset;
+ int count = dev->tc_to_txq[tc].count;
+
+ (*txq)++;
+ if (*txq == offset + count)
+ *txq = offset;
+}
+
+/* Prioritize higher traffic classes, and select among TXQs belonging to the
+ * same TC using round robin
+ */
+static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sk_buff *skb;
+ int tc;
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
+ for (tc = num_tc - 1; tc >= 0; tc--) {
+ int first_txq = q->cur_txq[tc];
if (!(gate_mask & BIT(tc)))
- return NULL;
+ continue;
+
+ do {
+ skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
+ entry, gate_mask);
+
+ taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
+
+ if (q->cur_txq[tc] >= dev->num_tx_queues)
+ q->cur_txq[tc] = first_txq;
- return skb;
+ if (skb)
+ return skb;
+ } while (q->cur_txq[tc] != first_txq);
}
return NULL;
}
-static inline int length_to_duration(struct taprio_sched *q, int len)
+/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
+ * class other than to determine whether the gate is open or not
+ */
+static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
{
- return (len * q->picos_per_byte) / 1000;
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
+ if (skb)
+ return skb;
+ }
+
+ return NULL;
}
+/* Will not be called in the full offload case, since the TX queues are
+ * attached to the Qdisc created using qdisc_create_dflt()
+ */
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb = NULL;
struct sched_entry *entry;
- struct sk_buff *skb;
u32 gate_mask;
- int i;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
/* if there's no entry, it means that the schedule didn't
* start yet, so force all gates to be open, this is in
* accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5
- * "AdminGateSates"
+ * "AdminGateStates"
*/
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
- rcu_read_unlock();
-
if (!gate_mask)
- return NULL;
-
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- ktime_t guard;
- int prio;
- int len;
- u8 tc;
-
- if (unlikely(!child))
- continue;
+ goto done;
+
+ if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
+ !static_branch_likely(&taprio_have_working_mqprio)) {
+ /* Single NIC kind which is broken */
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ } else if (static_branch_likely(&taprio_have_working_mqprio) &&
+ !static_branch_unlikely(&taprio_have_broken_mqprio)) {
+ /* Single NIC kind which prioritizes properly */
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
+ } else {
+ /* Mixed NIC kinds present in system, need dynamic testing */
+ if (q->broken_mqprio)
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ else
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
+ }
- skb = child->ops->peek(child);
- if (!skb)
- continue;
+done:
+ rcu_read_unlock();
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
+ return skb;
+}
- if (!(gate_mask & BIT(tc)))
- continue;
+static bool should_restart_cycle(const struct sched_gate_list *oper,
+ const struct sched_entry *entry)
+{
+ if (list_is_last(&entry->list, &oper->entries))
+ return true;
- len = qdisc_pkt_len(skb);
- guard = ktime_add_ns(q->get_time(),
- length_to_duration(q, len));
+ if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0)
+ return true;
- /* In the case that there's no gate entry, there's no
- * guard band ...
- */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- ktime_after(guard, entry->close_time))
- return NULL;
+ return false;
+}
- /* ... and no budget. */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- atomic_sub_return(len, &entry->budget) < 0)
- return NULL;
+static bool should_change_schedules(const struct sched_gate_list *admin,
+ const struct sched_gate_list *oper,
+ ktime_t end_time)
+{
+ ktime_t next_base_time, extension_time;
- skb = child->ops->dequeue(child);
- if (unlikely(!skb))
- return NULL;
+ if (!admin)
+ return false;
- qdisc_bstats_update(sch, skb);
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
+ next_base_time = sched_base_time(admin);
- return skb;
- }
+ /* This is the simple case, the end_time would fall after
+ * the next schedule base_time.
+ */
+ if (ktime_compare(next_base_time, end_time) <= 0)
+ return true;
- return NULL;
-}
+ /* This is the cycle_time_extension case, if the end_time
+ * plus the amount that can be extended would fall after the
+ * next schedule base_time, we can extend the current schedule
+ * for that amount.
+ */
+ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);
-static bool should_restart_cycle(const struct taprio_sched *q,
- const struct sched_entry *entry)
-{
- WARN_ON(!entry);
+ /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
+ * how precisely the extension should be made. So after
+ * conformance testing, this logic may change.
+ */
+ if (ktime_compare(next_base_time, extension_time) <= 0)
+ return true;
- return list_is_last(&entry->list, &q->entries);
+ return false;
}
static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
struct taprio_sched *q = container_of(timer, struct taprio_sched,
advance_timer);
+ struct net_device *dev = qdisc_dev(q->root);
+ struct sched_gate_list *oper, *admin;
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *entry, *next;
struct Qdisc *sch = q->root;
- ktime_t close_time;
+ ktime_t end_time;
+ int tc;
spin_lock(&q->current_entry_lock);
entry = rcu_dereference_protected(q->current_entry,
lockdep_is_held(&q->current_entry_lock));
+ oper = rcu_dereference_protected(q->oper_sched,
+ lockdep_is_held(&q->current_entry_lock));
+ admin = rcu_dereference_protected(q->admin_sched,
+ lockdep_is_held(&q->current_entry_lock));
+
+ if (!oper)
+ switch_schedules(q, &admin, &oper);
- /* This is the case that it's the first time that the schedule
- * runs, so it only happens once per schedule. The first entry
- * is pre-calculated during the schedule initialization.
+ /* This can happen in two cases: 1. this is the very first run
+ * of this function (i.e. we weren't running any schedule
+ * previously); 2. The previous schedule just ended. The first
+ * entry of all schedules are pre-calculated during the
+ * schedule initialization.
*/
- if (unlikely(!entry)) {
- next = list_first_entry(&q->entries, struct sched_entry,
+ if (unlikely(!entry || entry->end_time == oper->base_time)) {
+ next = list_first_entry(&oper->entries, struct sched_entry,
list);
- close_time = next->close_time;
+ end_time = next->end_time;
goto first_run;
}
- if (should_restart_cycle(q, entry))
- next = list_first_entry(&q->entries, struct sched_entry,
+ if (should_restart_cycle(oper, entry)) {
+ next = list_first_entry(&oper->entries, struct sched_entry,
list);
- else
+ oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
+ oper->cycle_time);
+ } else {
next = list_next_entry(entry, list);
+ }
+
+ end_time = ktime_add_ns(entry->end_time, next->interval);
+ end_time = min_t(ktime_t, end_time, oper->cycle_end_time);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (next->gate_duration[tc] == oper->cycle_time)
+ next->gate_close_time[tc] = KTIME_MAX;
+ else
+ next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
+ next->gate_duration[tc]);
+ }
- close_time = ktime_add_ns(entry->close_time, next->interval);
+ if (should_change_schedules(admin, oper, end_time)) {
+ /* Set things so the next time this runs, the new
+ * schedule runs.
+ */
+ end_time = sched_base_time(admin);
+ switch_schedules(q, &admin, &oper);
+ }
- next->close_time = close_time;
- atomic_set(&next->budget,
- (next->interval * 1000) / q->picos_per_byte);
+ next->end_time = end_time;
+ taprio_set_budgets(q, oper, next);
first_run:
rcu_assign_pointer(q->current_entry, next);
spin_unlock(&q->current_entry_lock);
- hrtimer_set_expires(&q->advance_timer, close_time);
+ hrtimer_set_expires(&q->advance_timer, end_time);
rcu_read_lock();
__netif_schedule(sch);
@@ -255,23 +1002,42 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
[TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
};
-static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
- [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
+static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
+ [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32,
+ TC_QOPT_MAX_QUEUE - 1),
+ [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 },
+ [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32,
+ TC_FP_EXPRESS,
+ TC_FP_PREEMPTIBLE),
+};
+
+static const struct netlink_range_validation_signed taprio_cycle_time_range = {
+ .min = 0,
+ .max = INT_MAX,
};
static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
[TCA_TAPRIO_ATTR_PRIOMAP] = {
.len = sizeof(struct tc_mqprio_qopt)
},
- [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
- [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
- [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
- [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
+ [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
+ [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
+ [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
+ [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
+ [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] =
+ NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range),
+ [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
+ [TCA_TAPRIO_ATTR_FLAGS] =
+ NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS),
+ [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 },
+ [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED },
};
-static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
+static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
+ struct sched_entry *entry,
struct netlink_ext_ack *extack)
{
+ int min_duration = length_to_duration(q, ETH_ZLEN);
u32 interval = 0;
if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
@@ -286,7 +1052,10 @@ static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
interval = nla_get_u32(
tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
- if (interval == 0) {
+ /* The interval should allow at least the minimum ethernet
+ * frame to go out.
+ */
+ if (interval < min_duration) {
NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
return -EINVAL;
}
@@ -296,14 +1065,15 @@ static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
return 0;
}
-static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
- int index, struct netlink_ext_ack *extack)
+static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n,
+ struct sched_entry *entry, int index,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
int err;
- err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
- entry_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
+ entry_policy, NULL);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Could not parse nested entry");
return -EINVAL;
@@ -311,73 +1081,11 @@ static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
entry->index = index;
- return fill_sched_entry(tb, entry, extack);
-}
-
-/* Returns the number of entries in case of success */
-static int parse_sched_single_entry(struct nlattr *n,
- struct taprio_sched *q,
- struct netlink_ext_ack *extack)
-{
- struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
- struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { };
- struct sched_entry *entry;
- bool found = false;
- u32 index;
- int err;
-
- err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX,
- n, entry_list_policy, NULL);
- if (err < 0) {
- NL_SET_ERR_MSG(extack, "Could not parse nested entry");
- return -EINVAL;
- }
-
- if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) {
- NL_SET_ERR_MSG(extack, "Single-entry must include an entry");
- return -EINVAL;
- }
-
- err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX,
- tb_list[TCA_TAPRIO_SCHED_ENTRY],
- entry_policy, NULL);
- if (err < 0) {
- NL_SET_ERR_MSG(extack, "Could not parse nested entry");
- return -EINVAL;
- }
-
- if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) {
- NL_SET_ERR_MSG(extack, "Entry must specify an index\n");
- return -EINVAL;
- }
-
- index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]);
- if (index >= q->num_entries) {
- NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule");
- return -EINVAL;
- }
-
- list_for_each_entry(entry, &q->entries, list) {
- if (entry->index == index) {
- found = true;
- break;
- }
- }
-
- if (!found) {
- NL_SET_ERR_MSG(extack, "Could not find entry");
- return -ENOENT;
- }
-
- err = fill_sched_entry(tb_entry, entry, extack);
- if (err < 0)
- return err;
-
- return q->num_entries;
+ return fill_sched_entry(q, tb, entry, extack);
}
-static int parse_sched_list(struct nlattr *list,
- struct taprio_sched *q,
+static int parse_sched_list(struct taprio_sched *q, struct nlattr *list,
+ struct sched_gate_list *sched,
struct netlink_ext_ack *extack)
{
struct nlattr *n;
@@ -401,87 +1109,85 @@ static int parse_sched_list(struct nlattr *list,
return -ENOMEM;
}
- err = parse_sched_entry(n, entry, i, extack);
+ err = parse_sched_entry(q, n, entry, i, extack);
if (err < 0) {
kfree(entry);
return err;
}
- list_add_tail(&entry->list, &q->entries);
+ list_add_tail(&entry->list, &sched->entries);
i++;
}
- q->num_entries = i;
+ sched->num_entries = i;
return i;
}
-/* Returns the number of entries in case of success */
-static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q,
- struct netlink_ext_ack *extack)
+static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
+ struct sched_gate_list *new,
+ struct netlink_ext_ack *extack)
{
int err = 0;
- int clockid;
- if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] &&
- tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
- return -EINVAL;
+ if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
+ NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
+ return -ENOTSUPP;
+ }
- if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0)
- return -EINVAL;
+ if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
+ new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
- if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID])
- return -EINVAL;
+ if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
+ new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);
- if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
- q->base_time = nla_get_s64(
- tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
+ if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
+ new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
+ err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST],
+ new, extack);
+ if (err < 0)
+ return err;
- if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
- clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+ if (!new->cycle_time) {
+ struct sched_entry *entry;
+ ktime_t cycle = 0;
- /* We only support static clockids and we don't allow
- * for it to be modified after the first init.
- */
- if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid))
+ list_for_each_entry(entry, &new->entries, list)
+ cycle = ktime_add_ns(cycle, entry->interval);
+
+ if (cycle < 0 || cycle > INT_MAX) {
+ NL_SET_ERR_MSG(extack, "'cycle_time' is too big");
return -EINVAL;
+ }
- q->clockid = clockid;
+ new->cycle_time = cycle;
}
- if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
- err = parse_sched_list(
- tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack);
- else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
- err = parse_sched_single_entry(
- tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack);
-
- /* parse_sched_* return the number of entries in the schedule,
- * a schedule with zero entries is an error.
- */
- if (err == 0) {
- NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry");
+ if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) {
+ NL_SET_ERR_MSG(extack, "'cycle_time' is too small");
return -EINVAL;
}
- return err;
+ taprio_calculate_gate_durations(q, new);
+
+ return 0;
}
static int taprio_parse_mqprio_opt(struct net_device *dev,
struct tc_mqprio_qopt *qopt,
- struct netlink_ext_ack *extack)
+ struct netlink_ext_ack *extack,
+ u32 taprio_flags)
{
- int i, j;
+ bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags);
if (!qopt) {
- NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
- return -EINVAL;
- }
-
- /* Verify num_tc is not out of max range */
- if (qopt->num_tc > TC_MAX_QUEUE) {
- NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
- return -EINVAL;
+ if (!dev->num_tc) {
+ NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
+ return -EINVAL;
+ }
+ return 0;
}
/* taprio imposes that traffic classes map 1:n to tx queues */
@@ -490,213 +1196,868 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
return -EINVAL;
}
- /* Verify priority mapping uses valid tcs */
- for (i = 0; i < TC_BITMASK + 1; i++) {
- if (qopt->prio_tc_map[i] >= qopt->num_tc) {
- NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
- return -EINVAL;
+ /* For some reason, in txtime-assist mode, we allow TXQ ranges for
+ * different TCs to overlap, and just validate the TXQ ranges.
+ */
+ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs,
+ extack);
+}
+
+static int taprio_get_start_time(struct Qdisc *sch,
+ struct sched_gate_list *sched,
+ ktime_t *start)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ ktime_t now, base, cycle;
+ s64 n;
+
+ base = sched_base_time(sched);
+ now = taprio_get_time(q);
+
+ if (ktime_after(base, now)) {
+ *start = base;
+ return 0;
+ }
+
+ cycle = sched->cycle_time;
+
+ /* The qdisc is expected to have at least one sched_entry. Moreover,
+ * any entry must have 'interval' > 0. Thus if the cycle time is zero,
+ * something went really wrong. In that case, we should warn about this
+ * inconsistent state and return error.
+ */
+ if (WARN_ON(!cycle))
+ return -EFAULT;
+
+ /* Schedule the start time for the beginning of the next
+ * cycle.
+ */
+ n = div64_s64(ktime_sub_ns(now, base), cycle);
+ *start = ktime_add_ns(base, (n + 1) * cycle);
+ return 0;
+}
+
+static void setup_first_end_time(struct taprio_sched *q,
+ struct sched_gate_list *sched, ktime_t base)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sched_entry *first;
+ ktime_t cycle;
+ int tc;
+
+ first = list_first_entry(&sched->entries,
+ struct sched_entry, list);
+
+ cycle = sched->cycle_time;
+
+ /* FIXME: find a better place to do this */
+ sched->cycle_end_time = ktime_add_ns(base, cycle);
+
+ first->end_time = ktime_add_ns(base, first->interval);
+ taprio_set_budgets(q, sched, first);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (first->gate_duration[tc] == sched->cycle_time)
+ first->gate_close_time[tc] = KTIME_MAX;
+ else
+ first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
+ }
+
+ rcu_assign_pointer(q->current_entry, NULL);
+}
+
+static void taprio_start_sched(struct Qdisc *sch,
+ ktime_t start, struct sched_gate_list *new)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ ktime_t expires;
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ return;
+
+ expires = hrtimer_get_expires(&q->advance_timer);
+ if (expires == 0)
+ expires = KTIME_MAX;
+
+ /* If the new schedule starts before the next expiration, we
+ * reprogram it to the earliest one, so we change the admin
+ * schedule to the operational one at the right time.
+ */
+ start = min_t(ktime_t, start, expires);
+
+ hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
+}
+
+static void taprio_set_picos_per_byte(struct net_device *dev,
+ struct taprio_sched *q,
+ struct netlink_ext_ack *extack)
+{
+ struct ethtool_link_ksettings ecmd;
+ int speed = SPEED_10;
+ int picos_per_byte;
+ int err;
+
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
+
+skip:
+ picos_per_byte = (USEC_PER_SEC * 8) / speed;
+ if (picos_per_byte < TAPRIO_PICOS_PER_BYTE_MIN) {
+ if (!extack)
+ pr_warn("Link speed %d is too high. Schedule may be inaccurate.\n",
+ speed);
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "Link speed %d is too high. Schedule may be inaccurate.",
+ speed);
+ picos_per_byte = TAPRIO_PICOS_PER_BYTE_MIN;
+ }
+
+ atomic64_set(&q->picos_per_byte, picos_per_byte);
+ netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
+ dev->name, (long long)atomic64_read(&q->picos_per_byte),
+ ecmd.base.speed);
+}
+
+static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct sched_gate_list *oper, *admin;
+ struct qdisc_size_table *stab;
+ struct taprio_sched *q;
+
+ ASSERT_RTNL();
+
+ if (event != NETDEV_UP && event != NETDEV_CHANGE)
+ return NOTIFY_DONE;
+
+ list_for_each_entry(q, &taprio_list, taprio_list) {
+ if (dev != qdisc_dev(q->root))
+ continue;
+
+ taprio_set_picos_per_byte(dev, q, NULL);
+
+ stab = rtnl_dereference(q->root->stab);
+
+ rcu_read_lock();
+ oper = rcu_dereference(q->oper_sched);
+ if (oper)
+ taprio_update_queue_max_sdu(q, oper, stab);
+
+ admin = rcu_dereference(q->admin_sched);
+ if (admin)
+ taprio_update_queue_max_sdu(q, admin, stab);
+ rcu_read_unlock();
+
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static void setup_txtime(struct taprio_sched *q,
+ struct sched_gate_list *sched, ktime_t base)
+{
+ struct sched_entry *entry;
+ u64 interval = 0;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ entry->next_txtime = ktime_add_ns(base, interval);
+ interval += entry->interval;
+ }
+}
+
+static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
+{
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = kzalloc(struct_size(__offload, offload.entries, num_entries),
+ GFP_KERNEL);
+ if (!__offload)
+ return NULL;
+
+ refcount_set(&__offload->users, 1);
+
+ return &__offload->offload;
+}
+
+struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
+ *offload)
+{
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = container_of(offload, struct __tc_taprio_qopt_offload,
+ offload);
+
+ refcount_inc(&__offload->users);
+
+ return offload;
+}
+EXPORT_SYMBOL_GPL(taprio_offload_get);
+
+void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
+{
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = container_of(offload, struct __tc_taprio_qopt_offload,
+ offload);
+
+ if (!refcount_dec_and_test(&__offload->users))
+ return;
+
+ kfree(__offload);
+}
+EXPORT_SYMBOL_GPL(taprio_offload_free);
+
+/* The function will only serve to keep the pointers to the "oper" and "admin"
+ * schedules valid in relation to their base times, so when calling dump() the
+ * users look at the right schedules.
+ * When using full offload, the admin configuration is promoted to oper at the
+ * base_time in the PHC time domain. But because the system time is not
+ * necessarily in sync with that, we can't just trigger a hrtimer to call
+ * switch_schedules at the right hardware time.
+ * At the moment we call this by hand right away from taprio, but in the future
+ * it will be useful to create a mechanism for drivers to notify taprio of the
+ * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump().
+ * This is left as TODO.
+ */
+static void taprio_offload_config_changed(struct taprio_sched *q)
+{
+ struct sched_gate_list *oper, *admin;
+
+ oper = rtnl_dereference(q->oper_sched);
+ admin = rtnl_dereference(q->admin_sched);
+
+ switch_schedules(q, &admin, &oper);
+}
+
+static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
+{
+ u32 i, queue_mask = 0;
+
+ for (i = 0; i < dev->num_tc; i++) {
+ u32 offset, count;
+
+ if (!(tc_mask & BIT(i)))
+ continue;
+
+ offset = dev->tc_to_txq[i].offset;
+ count = dev->tc_to_txq[i].count;
+
+ queue_mask |= GENMASK(offset + count - 1, offset);
+ }
+
+ return queue_mask;
+}
+
+static void taprio_sched_to_offload(struct net_device *dev,
+ struct sched_gate_list *sched,
+ struct tc_taprio_qopt_offload *offload,
+ const struct tc_taprio_caps *caps)
+{
+ struct sched_entry *entry;
+ int i = 0;
+
+ offload->base_time = sched->base_time;
+ offload->cycle_time = sched->cycle_time;
+ offload->cycle_time_extension = sched->cycle_time_extension;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ struct tc_taprio_sched_entry *e = &offload->entries[i];
+
+ e->command = entry->command;
+ e->interval = entry->interval;
+ if (caps->gate_mask_per_txq)
+ e->gate_mask = tc_map_to_queue_mask(dev,
+ entry->gate_mask);
+ else
+ e->gate_mask = entry->gate_mask;
+
+ i++;
+ }
+
+ offload->num_entries = i;
+}
+
+static void taprio_detect_broken_mqprio(struct taprio_sched *q)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ struct tc_taprio_caps caps;
+
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
+ &caps, sizeof(caps));
+
+ q->broken_mqprio = caps.broken_mqprio;
+ if (q->broken_mqprio)
+ static_branch_inc(&taprio_have_broken_mqprio);
+ else
+ static_branch_inc(&taprio_have_working_mqprio);
+
+ q->detected_mqprio = true;
+}
+
+static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
+{
+ if (!q->detected_mqprio)
+ return;
+
+ if (q->broken_mqprio)
+ static_branch_dec(&taprio_have_broken_mqprio);
+ else
+ static_branch_dec(&taprio_have_working_mqprio);
+}
+
+static int taprio_enable_offload(struct net_device *dev,
+ struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_taprio_qopt_offload *offload;
+ struct tc_taprio_caps caps;
+ int tc, err = 0;
+
+ if (!ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not support taprio offload");
+ return -EOPNOTSUPP;
+ }
+
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
+ &caps, sizeof(caps));
+
+ if (!caps.supports_queue_max_sdu) {
+ for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
+ if (q->max_sdu[tc]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Device does not handle queueMaxSDU");
+ return -EOPNOTSUPP;
+ }
}
}
- for (i = 0; i < qopt->num_tc; i++) {
- unsigned int last = qopt->offset[i] + qopt->count[i];
+ offload = taprio_offload_alloc(sched->num_entries);
+ if (!offload) {
+ NL_SET_ERR_MSG(extack,
+ "Not enough memory for enabling offload mode");
+ return -ENOMEM;
+ }
+ offload->cmd = TAPRIO_CMD_REPLACE;
+ offload->extack = extack;
+ mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
+ offload->mqprio.extack = extack;
+ taprio_sched_to_offload(dev, sched, offload, &caps);
+ mqprio_fp_to_offload(q->fp, &offload->mqprio);
+
+ for (tc = 0; tc < TC_MAX_QUEUE; tc++)
+ offload->max_sdu[tc] = q->max_sdu[tc];
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+ if (err < 0) {
+ NL_SET_ERR_MSG_WEAK(extack,
+ "Device failed to setup taprio offload");
+ goto done;
+ }
+
+ q->offloaded = true;
+
+done:
+ /* The offload structure may linger around via a reference taken by the
+ * device driver, so clear up the netlink extack pointer so that the
+ * driver isn't tempted to dereference data which stopped being valid
+ */
+ offload->extack = NULL;
+ offload->mqprio.extack = NULL;
+ taprio_offload_free(offload);
+
+ return err;
+}
+
+static int taprio_disable_offload(struct net_device *dev,
+ struct taprio_sched *q,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_taprio_qopt_offload *offload;
+ int err;
+
+ if (!q->offloaded)
+ return 0;
+
+ offload = taprio_offload_alloc(0);
+ if (!offload) {
+ NL_SET_ERR_MSG(extack,
+ "Not enough memory to disable offload mode");
+ return -ENOMEM;
+ }
+ offload->cmd = TAPRIO_CMD_DESTROY;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device failed to disable offload");
+ goto out;
+ }
+
+ q->offloaded = false;
+
+out:
+ taprio_offload_free(offload);
+
+ return err;
+}
+
+/* If full offload is enabled, the only possible clockid is the net device's
+ * PHC. For that reason, specifying a clockid through netlink is incorrect.
+ * For txtime-assist, it is implicitly assumed that the device's PHC is kept
+ * in sync with the specified clockid via a user space daemon such as phc2sys.
+ * For both software taprio and txtime-assist, the clockid is used for the
+ * hrtimer that advances the schedule and hence mandatory.
+ */
+static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int err = -EINVAL;
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct kernel_ethtool_ts_info info = {
+ .cmd = ETHTOOL_GET_TS_INFO,
+ .phc_index = -1,
+ };
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+ NL_SET_ERR_MSG(extack,
+ "The 'clockid' cannot be specified for full offload");
+ goto out;
+ }
+
+ if (ops && ops->get_ts_info)
+ err = ops->get_ts_info(dev, &info);
- /* Verify the queue count is in tx range being equal to the
- * real_num_tx_queues indicates the last queue is in use.
+ if (err || info.phc_index < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not have a PTP clock");
+ err = -ENOTSUPP;
+ goto out;
+ }
+ } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+ int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+ enum tk_offsets tk_offset;
+
+ /* We only support static clockids and we don't allow
+ * for it to be modified after the first init.
*/
- if (qopt->offset[i] >= dev->num_tx_queues ||
- !qopt->count[i] ||
- last > dev->real_num_tx_queues) {
- NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
- return -EINVAL;
+ if (clockid < 0 ||
+ (q->clockid != -1 && q->clockid != clockid)) {
+ NL_SET_ERR_MSG(extack,
+ "Changing the 'clockid' of a running schedule is not supported");
+ err = -ENOTSUPP;
+ goto out;
}
- /* Verify that the offset and counts do not overlap */
- for (j = i + 1; j < qopt->num_tc; j++) {
- if (last > qopt->offset[j]) {
- NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
- return -EINVAL;
- }
+ switch (clockid) {
+ case CLOCK_REALTIME:
+ tk_offset = TK_OFFS_REAL;
+ break;
+ case CLOCK_MONOTONIC:
+ tk_offset = TK_OFFS_MAX;
+ break;
+ case CLOCK_BOOTTIME:
+ tk_offset = TK_OFFS_BOOT;
+ break;
+ case CLOCK_TAI:
+ tk_offset = TK_OFFS_TAI;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+ err = -EINVAL;
+ goto out;
}
+ /* This pairs with READ_ONCE() in taprio_mono_to_any */
+ WRITE_ONCE(q->tk_offset, tk_offset);
+
+ q->clockid = clockid;
+ } else {
+ NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
+ goto out;
}
- return 0;
+ /* Everything went ok, return success. */
+ err = 0;
+
+out:
+ return err;
}
-static ktime_t taprio_get_start_time(struct Qdisc *sch)
+static int taprio_parse_tc_entry(struct Qdisc *sch,
+ struct nlattr *opt,
+ u32 max_sdu[TC_QOPT_MAX_QUEUE],
+ u32 fp[TC_QOPT_MAX_QUEUE],
+ unsigned long *seen_tcs,
+ struct netlink_ext_ack *extack)
{
- struct taprio_sched *q = qdisc_priv(sch);
- struct sched_entry *entry;
- ktime_t now, base, cycle;
- s64 n;
+ struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
+ struct net_device *dev = qdisc_dev(sch);
+ int err, tc;
+ u32 val;
+
+ err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
+ taprio_tc_policy, extack);
+ if (err < 0)
+ return err;
- base = ns_to_ktime(q->base_time);
- cycle = 0;
+ if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_TAPRIO_TC_ENTRY_INDEX)) {
+ NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
+ return -EINVAL;
+ }
- /* Calculate the cycle_time, by summing all the intervals.
- */
- list_for_each_entry(entry, &q->entries, list)
- cycle = ktime_add_ns(cycle, entry->interval);
+ tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]);
+ if (*seen_tcs & BIT(tc)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_TC_ENTRY_INDEX],
+ "Duplicate tc entry");
+ return -EINVAL;
+ }
- if (!cycle)
- return base;
+ *seen_tcs |= BIT(tc);
- now = q->get_time();
+ if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) {
+ val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
+ if (val > dev->max_mtu) {
+ NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
+ return -ERANGE;
+ }
- if (ktime_after(base, now))
- return base;
+ max_sdu[tc] = val;
+ }
- /* Schedule the start time for the beginning of the next
- * cycle.
- */
- n = div64_s64(ktime_sub_ns(now, base), cycle);
+ if (tb[TCA_TAPRIO_TC_ENTRY_FP])
+ fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]);
- return ktime_add_ns(base, (n + 1) * cycle);
+ return 0;
}
-static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
+static int taprio_parse_tc_entries(struct Qdisc *sch,
+ struct nlattr *opt,
+ struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct sched_entry *first;
- unsigned long flags;
+ struct net_device *dev = qdisc_dev(sch);
+ u32 max_sdu[TC_QOPT_MAX_QUEUE];
+ bool have_preemption = false;
+ unsigned long seen_tcs = 0;
+ u32 fp[TC_QOPT_MAX_QUEUE];
+ struct nlattr *n;
+ int tc, rem;
+ int err = 0;
- spin_lock_irqsave(&q->current_entry_lock, flags);
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ max_sdu[tc] = q->max_sdu[tc];
+ fp[tc] = q->fp[tc];
+ }
- first = list_first_entry(&q->entries, struct sched_entry,
- list);
+ nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) {
+ err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs,
+ extack);
+ if (err)
+ return err;
+ }
- first->close_time = ktime_add_ns(start, first->interval);
- atomic_set(&first->budget,
- (first->interval * 1000) / q->picos_per_byte);
- rcu_assign_pointer(q->current_entry, NULL);
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ q->max_sdu[tc] = max_sdu[tc];
+ q->fp[tc] = fp[tc];
+ if (fp[tc] != TC_FP_EXPRESS)
+ have_preemption = true;
+ }
- spin_unlock_irqrestore(&q->current_entry_lock, flags);
+ if (have_preemption) {
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ NL_SET_ERR_MSG(extack,
+ "Preemption only supported with full offload");
+ return -EOPNOTSUPP;
+ }
- hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
+ if (!ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not support preemption");
+ return -EOPNOTSUPP;
+ }
+ }
+
+ return err;
+}
+
+static int taprio_mqprio_cmp(const struct net_device *dev,
+ const struct tc_mqprio_qopt *mqprio)
+{
+ int i;
+
+ if (!mqprio || mqprio->num_tc != dev->num_tc)
+ return -1;
+
+ for (i = 0; i < mqprio->num_tc; i++)
+ if (dev->tc_to_txq[i].count != mqprio->count[i] ||
+ dev->tc_to_txq[i].offset != mqprio->offset[i])
+ return -1;
+
+ for (i = 0; i <= TC_BITMASK; i++)
+ if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i])
+ return -1;
+
+ return 0;
}
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
+ struct sched_gate_list *oper, *admin, *new_admin;
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct tc_mqprio_qopt *mqprio = NULL;
- struct ethtool_link_ksettings ecmd;
- int i, err, size;
- s64 link_speed;
+ unsigned long flags;
+ u32 taprio_flags;
ktime_t start;
+ int i, err;
- err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt,
- taprio_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
+ taprio_policy, extack);
if (err < 0)
return err;
- err = -EINVAL;
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
- err = taprio_parse_mqprio_opt(dev, mqprio, extack);
+ /* The semantics of the 'flags' argument in relation to 'change()'
+ * requests, are interpreted following two rules (which are applied in
+ * this order): (1) an omitted 'flags' argument is interpreted as
+ * zero; (2) the 'flags' of a "running" taprio instance cannot be
+ * changed.
+ */
+ taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0);
+
+ /* txtime-assist and full offload are mutually exclusive */
+ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
+ (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS],
+ "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive");
+ return -EINVAL;
+ }
+
+ if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Changing 'flags' of a running schedule is not supported");
+ return -EOPNOTSUPP;
+ }
+ q->flags = taprio_flags;
+
+ /* Needed for length_to_duration() during netlink attribute parsing */
+ taprio_set_picos_per_byte(dev, q, extack);
+
+ err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
if (err < 0)
return err;
- /* A schedule with less than one entry is an error */
- size = parse_taprio_opt(tb, q, extack);
- if (size < 0)
- return size;
-
- hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
- q->advance_timer.function = advance_sched;
+ err = taprio_parse_tc_entries(sch, opt, extack);
+ if (err)
+ return err;
- switch (q->clockid) {
- case CLOCK_REALTIME:
- q->get_time = ktime_get_real;
- break;
- case CLOCK_MONOTONIC:
- q->get_time = ktime_get;
- break;
- case CLOCK_BOOTTIME:
- q->get_time = ktime_get_boottime;
- break;
- case CLOCK_TAI:
- q->get_time = ktime_get_clocktai;
- break;
- default:
- return -ENOTSUPP;
+ new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
+ if (!new_admin) {
+ NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
+ return -ENOMEM;
}
+ INIT_LIST_HEAD(&new_admin->entries);
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct netdev_queue *dev_queue;
- struct Qdisc *qdisc;
-
- dev_queue = netdev_get_tx_queue(dev, i);
- qdisc = qdisc_create_dflt(dev_queue,
- &pfifo_qdisc_ops,
- TC_H_MAKE(TC_H_MAJ(sch->handle),
- TC_H_MIN(i + 1)),
- extack);
- if (!qdisc)
- return -ENOMEM;
+ oper = rtnl_dereference(q->oper_sched);
+ admin = rtnl_dereference(q->admin_sched);
- if (i < dev->real_num_tx_queues)
- qdisc_hash_add(qdisc, false);
+ /* no changes - no new mqprio settings */
+ if (!taprio_mqprio_cmp(dev, mqprio))
+ mqprio = NULL;
- q->qdiscs[i] = qdisc;
+ if (mqprio && (oper || admin)) {
+ NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
+ err = -ENOTSUPP;
+ goto free_sched;
}
if (mqprio) {
- netdev_set_num_tc(dev, mqprio->num_tc);
- for (i = 0; i < mqprio->num_tc; i++)
+ err = netdev_set_num_tc(dev, mqprio->num_tc);
+ if (err)
+ goto free_sched;
+ for (i = 0; i < mqprio->num_tc; i++) {
netdev_set_tc_queue(dev, i,
mqprio->count[i],
mqprio->offset[i]);
+ q->cur_txq[i] = mqprio->offset[i];
+ }
/* Always use supplied priority mappings */
- for (i = 0; i < TC_BITMASK + 1; i++)
+ for (i = 0; i <= TC_BITMASK; i++)
netdev_set_prio_tc_map(dev, i,
mqprio->prio_tc_map[i]);
}
- if (!__ethtool_get_link_ksettings(dev, &ecmd))
- link_speed = ecmd.base.speed;
+ err = parse_taprio_schedule(q, tb, new_admin, extack);
+ if (err < 0)
+ goto free_sched;
+
+ if (new_admin->num_entries == 0) {
+ NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
+ err = -EINVAL;
+ goto free_sched;
+ }
+
+ err = taprio_parse_clockid(sch, tb, extack);
+ if (err < 0)
+ goto free_sched;
+
+ taprio_update_queue_max_sdu(q, new_admin, stab);
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ err = taprio_enable_offload(dev, q, new_admin, extack);
else
- link_speed = SPEED_1000;
+ err = taprio_disable_offload(dev, q, extack);
+ if (err)
+ goto free_sched;
+
+ /* Protects against enqueue()/dequeue() */
+ spin_lock_bh(qdisc_lock(sch));
+
+ if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
+ if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
+ err = -EINVAL;
+ goto unlock;
+ }
- q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
- link_speed * 1000 * 1000);
+ q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
+ }
- start = taprio_get_start_time(sch);
- if (!start)
- return 0;
+ if (!TXTIME_ASSIST_IS_ENABLED(q->flags) &&
+ !FULL_OFFLOAD_IS_ENABLED(q->flags) &&
+ !hrtimer_active(&q->advance_timer)) {
+ hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS);
+ }
+
+ err = taprio_get_start_time(sch, new_admin, &start);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
+ goto unlock;
+ }
- taprio_start_sched(sch, start);
+ setup_txtime(q, new_admin, start);
- return 0;
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ if (!oper) {
+ rcu_assign_pointer(q->oper_sched, new_admin);
+ err = 0;
+ new_admin = NULL;
+ goto unlock;
+ }
+
+ /* Not going to race against advance_sched(), but still */
+ admin = rcu_replace_pointer(q->admin_sched, new_admin,
+ lockdep_rtnl_is_held());
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+ } else {
+ setup_first_end_time(q, new_admin, start);
+
+ /* Protects against advance_sched() */
+ spin_lock_irqsave(&q->current_entry_lock, flags);
+
+ taprio_start_sched(sch, start, new_admin);
+
+ admin = rcu_replace_pointer(q->admin_sched, new_admin,
+ lockdep_rtnl_is_held());
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ spin_unlock_irqrestore(&q->current_entry_lock, flags);
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ taprio_offload_config_changed(q);
+ }
+
+ new_admin = NULL;
+ err = 0;
+
+ if (!stab)
+ NL_SET_ERR_MSG_MOD(extack,
+ "Size table not specified, frame length estimations may be inaccurate");
+
+unlock:
+ spin_unlock_bh(qdisc_lock(sch));
+
+free_sched:
+ if (new_admin)
+ call_rcu(&new_admin->rcu, taprio_free_sched_cb);
+
+ return err;
+}
+
+static void taprio_reset(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int i;
+
+ hrtimer_cancel(&q->advance_timer);
+
+ if (q->qdiscs) {
+ for (i = 0; i < dev->num_tx_queues; i++)
+ if (q->qdiscs[i])
+ qdisc_reset(q->qdiscs[i]);
+ }
}
static void taprio_destroy(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- struct sched_entry *entry, *n;
+ struct sched_gate_list *oper, *admin;
unsigned int i;
+ list_del(&q->taprio_list);
+
+ /* Note that taprio_reset() might not be called if an error
+ * happens in qdisc_create(), after taprio_init() has been called.
+ */
hrtimer_cancel(&q->advance_timer);
+ qdisc_synchronize(sch);
+
+ taprio_disable_offload(dev, q, NULL);
if (q->qdiscs) {
- for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
+ for (i = 0; i < dev->num_tx_queues; i++)
qdisc_put(q->qdiscs[i]);
kfree(q->qdiscs);
}
q->qdiscs = NULL;
- netdev_set_num_tc(dev, 0);
+ netdev_reset_tc(dev);
- list_for_each_entry_safe(entry, n, &q->entries, list) {
- list_del(&entry->list);
- kfree(entry);
- }
+ oper = rtnl_dereference(q->oper_sched);
+ admin = rtnl_dereference(q->admin_sched);
+
+ if (oper)
+ call_rcu(&oper->rcu, taprio_free_sched_cb);
+
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ taprio_cleanup_broken_mqprio(q);
}
static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
@@ -704,12 +2065,11 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ int i, tc;
- INIT_LIST_HEAD(&q->entries);
spin_lock_init(&q->current_entry_lock);
- /* We may overwrite the configuration later */
- hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
+ hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS);
q->root = sch;
@@ -717,27 +2077,91 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
* and get the valid one on taprio_change().
*/
q->clockid = -1;
+ q->flags = TAPRIO_FLAGS_INVALID;
+
+ list_add(&q->taprio_list, &taprio_list);
- if (sch->parent != TC_H_ROOT)
+ if (sch->parent != TC_H_ROOT) {
+ NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc");
return -EOPNOTSUPP;
+ }
- if (!netif_is_multiqueue(dev))
+ if (!netif_is_multiqueue(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required");
return -EOPNOTSUPP;
+ }
- /* pre-allocate qdisc, attachment can't fail */
- q->qdiscs = kcalloc(dev->num_tx_queues,
- sizeof(q->qdiscs[0]),
+ q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]),
GFP_KERNEL);
-
if (!q->qdiscs)
return -ENOMEM;
if (!opt)
return -EINVAL;
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue,
+ &pfifo_qdisc_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)),
+ extack);
+ if (!qdisc)
+ return -ENOMEM;
+
+ if (i < dev->real_num_tx_queues)
+ qdisc_hash_add(qdisc, false);
+
+ q->qdiscs[i] = qdisc;
+ }
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ q->fp[tc] = TC_FP_EXPRESS;
+
+ taprio_detect_broken_mqprio(q);
+
return taprio_change(sch, opt, extack);
}
+static void taprio_attach(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx;
+
+ /* Attach underlying qdisc */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+ struct Qdisc *old, *dev_queue_qdisc;
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ struct Qdisc *qdisc = q->qdiscs[ntx];
+
+ /* In offload mode, the root taprio qdisc is bypassed
+ * and the netdev TX queues see the children directly
+ */
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ dev_queue_qdisc = qdisc;
+ } else {
+ /* In software mode, attach the root taprio qdisc
+ * to all netdev TX queues, so that dev_qdisc_enqueue()
+ * goes through taprio_enqueue().
+ */
+ dev_queue_qdisc = sch;
+ }
+ old = dev_graft_qdisc(dev_queue, dev_queue_qdisc);
+ /* The qdisc's refcount needs to be elevated once
+ * for each netdev TX queue it is grafted onto
+ */
+ qdisc_refcount_inc(dev_queue_qdisc);
+ if (old)
+ qdisc_put(old);
+ }
+}
+
static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
unsigned long cl)
{
@@ -764,9 +2188,23 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl,
if (dev->flags & IFF_UP)
dev_deactivate(dev);
+ /* In offload mode, the child Qdisc is directly attached to the netdev
+ * TX queue, and thus, we need to keep its refcount elevated in order
+ * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue.
+ * However, save the reference to the new qdisc in the private array in
+ * both software and offload cases, to have an up-to-date reference to
+ * our children.
+ */
*old = q->qdiscs[cl - 1];
- q->qdiscs[cl - 1] = new;
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old);
+ if (new)
+ qdisc_refcount_inc(new);
+ if (*old)
+ qdisc_put(*old);
+ }
+ q->qdiscs[cl - 1] = new;
if (new)
new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
@@ -781,7 +2219,7 @@ static int dump_entry(struct sk_buff *msg,
{
struct nlattr *item;
- item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY);
+ item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
if (!item)
return -ENOSPC;
@@ -806,63 +2244,218 @@ nla_put_failure:
return -1;
}
+static int dump_schedule(struct sk_buff *msg,
+ const struct sched_gate_list *root)
+{
+ struct nlattr *entry_list;
+ struct sched_entry *entry;
+
+ if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
+ root->base_time, TCA_TAPRIO_PAD))
+ return -1;
+
+ if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
+ root->cycle_time, TCA_TAPRIO_PAD))
+ return -1;
+
+ if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
+ root->cycle_time_extension, TCA_TAPRIO_PAD))
+ return -1;
+
+ entry_list = nla_nest_start_noflag(msg,
+ TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
+ if (!entry_list)
+ goto error_nest;
+
+ list_for_each_entry(entry, &root->entries, list) {
+ if (dump_entry(msg, entry) < 0)
+ goto error_nest;
+ }
+
+ nla_nest_end(msg, entry_list);
+ return 0;
+
+error_nest:
+ nla_nest_cancel(msg, entry_list);
+ return -1;
+}
+
+static int taprio_dump_tc_entries(struct sk_buff *skb,
+ struct taprio_sched *q,
+ struct sched_gate_list *sched)
+{
+ struct nlattr *n;
+ int tc;
+
+ for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
+ n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
+ if (!n)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
+ sched->max_sdu[tc]))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc]))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, n);
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, n);
+ return -EMSGSIZE;
+}
+
+static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
+{
+ if (val == TAPRIO_STAT_NOT_SET)
+ return 0;
+ if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d,
+ struct tc_taprio_qopt_offload *offload,
+ struct tc_taprio_qopt_stats *stats)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ const struct net_device_ops *ops;
+ struct sk_buff *skb = d->skb;
+ struct nlattr *xstats;
+ int err;
+
+ ops = qdisc_dev(sch)->netdev_ops;
+
+ /* FIXME I could use qdisc_offload_dump_helper(), but that messes
+ * with sch->flags depending on whether the device reports taprio
+ * stats, and I'm not sure whether that's a good idea, considering
+ * that stats are optional to the offload itself
+ */
+ if (!ops->ndo_setup_tc)
+ return 0;
+
+ memset(stats, 0xff, sizeof(*stats));
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+ if (err == -EOPNOTSUPP)
+ return 0;
+ if (err)
+ return err;
+
+ xstats = nla_nest_start(skb, TCA_STATS_APP);
+ if (!xstats)
+ goto err;
+
+ if (taprio_put_stat(skb, stats->window_drops,
+ TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) ||
+ taprio_put_stat(skb, stats->tx_overruns,
+ TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS))
+ goto err_cancel;
+
+ nla_nest_end(skb, xstats);
+
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, xstats);
+err:
+ return -EMSGSIZE;
+}
+
+static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct tc_taprio_qopt_offload offload = {
+ .cmd = TAPRIO_CMD_STATS,
+ };
+
+ return taprio_dump_xstats(sch, d, &offload, &offload.stats);
+}
+
static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct sched_gate_list *oper, *admin;
struct tc_mqprio_qopt opt = { 0 };
- struct nlattr *nest, *entry_list;
- struct sched_entry *entry;
- unsigned int i;
+ struct nlattr *nest, *sched_nest;
- opt.num_tc = netdev_get_num_tc(dev);
- memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ mqprio_qopt_reconstruct(dev, &opt);
- for (i = 0; i < netdev_get_num_tc(dev); i++) {
- opt.count[i] = dev->tc_to_txq[i].count;
- opt.offset[i] = dev->tc_to_txq[i].offset;
- }
-
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
- return -ENOSPC;
+ goto start_error;
if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
goto options_error;
- if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
- q->base_time, TCA_TAPRIO_PAD))
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
+ nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
goto options_error;
- if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+ if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
goto options_error;
- entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
- if (!entry_list)
+ if (q->txtime_delay &&
+ nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
goto options_error;
- list_for_each_entry(entry, &q->entries, list) {
- if (dump_entry(skb, entry) < 0)
- goto options_error;
- }
+ rcu_read_lock();
+
+ oper = rtnl_dereference(q->oper_sched);
+ admin = rtnl_dereference(q->admin_sched);
+
+ if (oper && taprio_dump_tc_entries(skb, q, oper))
+ goto options_error_rcu;
+
+ if (oper && dump_schedule(skb, oper))
+ goto options_error_rcu;
- nla_nest_end(skb, entry_list);
+ if (!admin)
+ goto done;
+ sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
+ if (!sched_nest)
+ goto options_error_rcu;
+
+ if (dump_schedule(skb, admin))
+ goto admin_error;
+
+ nla_nest_end(skb, sched_nest);
+
+done:
+ rcu_read_unlock();
return nla_nest_end(skb, nest);
+admin_error:
+ nla_nest_cancel(skb, sched_nest);
+
+options_error_rcu:
+ rcu_read_unlock();
+
options_error:
nla_nest_cancel(skb, nest);
- return -1;
+
+start_error:
+ return -ENOSPC;
}
static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
{
- struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx = cl - 1;
- if (!dev_queue)
+ if (ntx >= dev->num_tx_queues)
return NULL;
- return dev_queue->qdisc_sleeping;
+ return q->qdiscs[ntx];
}
static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
@@ -877,11 +2470,11 @@ static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
- struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+ struct Qdisc *child = taprio_leaf(sch, cl);
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle |= TC_H_MIN(cl);
- tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ tcm->tcm_info = child->handle;
return 0;
}
@@ -891,13 +2484,19 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
__releases(d->lock)
__acquires(d->lock)
{
- struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
-
- sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+ struct Qdisc *child = taprio_leaf(sch, cl);
+ struct tc_taprio_qopt_offload offload = {
+ .cmd = TAPRIO_CMD_QUEUE_STATS,
+ .queue_stats = {
+ .queue = cl - 1,
+ },
+ };
+
+ if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 ||
+ qdisc_qstats_copy(d, child) < 0)
return -1;
- return 0;
+
+ return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats);
}
static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
@@ -910,11 +2509,8 @@ static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count = arg->skip;
for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
- if (arg->fn(sch, ntx + 1, arg) < 0) {
- arg->stop = 1;
+ if (!tc_qdisc_stats_dump(sch, ntx + 1, arg))
break;
- }
- arg->count++;
}
}
@@ -939,24 +2535,40 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
.id = "taprio",
.priv_size = sizeof(struct taprio_sched),
.init = taprio_init,
+ .change = taprio_change,
.destroy = taprio_destroy,
+ .reset = taprio_reset,
+ .attach = taprio_attach,
.peek = taprio_peek,
.dequeue = taprio_dequeue,
.enqueue = taprio_enqueue,
.dump = taprio_dump,
+ .dump_stats = taprio_dump_stats,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("taprio");
+
+static struct notifier_block taprio_device_notifier = {
+ .notifier_call = taprio_dev_notifier,
+};
static int __init taprio_module_init(void)
{
+ int err = register_netdevice_notifier(&taprio_device_notifier);
+
+ if (err)
+ return err;
+
return register_qdisc(&taprio_qdisc_ops);
}
static void __exit taprio_module_exit(void)
{
unregister_qdisc(&taprio_qdisc_ops);
+ unregister_netdevice_notifier(&taprio_device_notifier);
}
module_init(taprio_module_init);
module_exit(taprio_module_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Time Aware Priority qdisc");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 7f272a9070c5..f2340164f579 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_tbf.c Token Bucket Filter queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
* original idea by Martin Devera
- *
*/
#include <linux/module.h>
@@ -18,8 +13,10 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <net/gso.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
@@ -142,6 +139,66 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r,
return len;
}
+static void tbf_offload_change(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.rate = q->rate;
+ qopt.replace_params.max_size = q->max_size;
+ qopt.replace_params.qstats = &sch->qstats;
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static void tbf_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static int tbf_offload_dump(struct Qdisc *sch)
+{
+ struct tc_tbf_qopt_offload qopt;
+
+ qopt.command = TC_TBF_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
+ struct Qdisc *old, struct netlink_ext_ack *extack)
+{
+ struct tc_tbf_qopt_offload graft_offload = {
+ .handle = sch->handle,
+ .parent = sch->parent,
+ .child_handle = new->handle,
+ .command = TC_TBF_GRAFT,
+ };
+
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
+ TC_SETUP_QDISC_TBF, &graft_offload, extack);
+}
+
/* GSO packet is too big, segment it so that tbf can transmit
* each segment in time
*/
@@ -151,7 +208,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
- unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
+ unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len;
int ret, nb;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
@@ -160,25 +217,30 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_drop(skb, sch, to_free);
nb = 0;
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
- qdisc_skb_cb(segs)->pkt_len = segs->len;
- len += segs->len;
+ seg_len = segs->len;
+ qdisc_skb_cb(segs)->pkt_len = seg_len;
+ qdisc_skb_cb(segs)->pkt_segs = 1;
ret = qdisc_enqueue(segs, q->qdisc, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
} else {
nb++;
+ len += seg_len;
}
- segs = nskb;
}
sch->q.qlen += nb;
- if (nb > 1)
+ sch->qstats.backlog += len;
+ if (nb > 0) {
qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
- consume_skb(skb);
- return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+ consume_skb(skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
}
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -276,8 +338,6 @@ static void tbf_reset(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
- sch->qstats.backlog = 0;
- sch->q.qlen = 0;
q->t_c = ktime_get_ns();
q->tokens = q->buffer;
q->ptokens = q->mtu;
@@ -302,13 +362,15 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
struct nlattr *tb[TCA_TBF_MAX + 1];
struct tc_tbf_qopt *qopt;
struct Qdisc *child = NULL;
+ struct Qdisc *old = NULL;
struct psched_ratecfg rate;
struct psched_ratecfg peak;
u64 max_size;
s64 buffer, mtu;
u64 rate64 = 0, prate64 = 0;
- err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
+ NULL);
if (err < 0)
return err;
@@ -391,9 +453,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
sch_tree_lock(sch);
if (child) {
- qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
- q->qdisc->qstats.backlog);
- qdisc_put(q->qdisc);
+ qdisc_purge_queue(q->qdisc);
+ old = q->qdisc;
q->qdisc = child;
}
q->limit = qopt->limit;
@@ -413,7 +474,10 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
sch_tree_unlock(sch);
+ qdisc_put(old);
err = 0;
+
+ tbf_offload_change(sch);
done:
return err;
}
@@ -439,6 +503,7 @@ static void tbf_destroy(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
+ tbf_offload_destroy(sch);
qdisc_put(q->qdisc);
}
@@ -447,9 +512,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_tbf_qopt opt;
+ int err;
- sch->qstats.backlog = q->qdisc->qstats.backlog;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ err = tbf_offload_dump(sch);
+ if (err)
+ return err;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -500,6 +569,8 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
new = &noop_qdisc;
*old = qdisc_replace(sch, new, &q->qdisc);
+
+ tbf_offload_graft(sch, new, *old, extack);
return 0;
}
@@ -517,12 +588,7 @@ static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
- if (walker->count >= walker->skip)
- if (walker->fn(sch, 1, walker) < 0) {
- walker->stop = 1;
- return;
- }
- walker->count++;
+ tc_qdisc_stats_dump(sch, 1, walker);
}
}
@@ -549,6 +615,7 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
.dump = tbf_dump,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_NET_SCH("tbf");
static int __init tbf_module_init(void)
{
@@ -562,3 +629,4 @@ static void __exit tbf_module_exit(void)
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Token Bucket Filter qdisc");
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 93f04cf5cac1..8badec6d82a2 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -82,7 +78,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
struct net_device *dev = qdisc_dev(sch);
struct teql_sched_data *q = qdisc_priv(sch);
- if (q->q.qlen < dev->tx_queue_len) {
+ if (q->q.qlen < READ_ONCE(dev->tx_queue_len)) {
__skb_queue_tail(&q->q, skb);
return NET_XMIT_SUCCESS;
}
@@ -128,7 +124,6 @@ teql_reset(struct Qdisc *sch)
struct teql_sched_data *dat = qdisc_priv(sch);
skb_queue_purge(&dat->q);
- sch->q.qlen = 0;
}
static void
@@ -138,6 +133,9 @@ teql_destroy(struct Qdisc *sch)
struct teql_sched_data *dat = qdisc_priv(sch);
struct teql_master *master = dat->m;
+ if (!master)
+ return;
+
prev = master->slaves;
if (prev) {
do {
@@ -243,7 +241,7 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
char haddr[MAX_ADDR_LEN];
neigh_ha_snapshot(haddr, n, dev);
- err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)),
+ err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)),
haddr, NULL, skb->len);
if (err < 0)
@@ -299,7 +297,7 @@ restart:
struct net_device *slave = qdisc_dev(q);
struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
- if (slave_txq->qdisc_sleeping != q)
+ if (rcu_access_pointer(slave_txq->qdisc_sleeping) != q)
continue;
if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
!netif_running(slave)) {
@@ -426,7 +424,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
} while ((q = NEXT_SLAVE(q)) != m->slaves);
}
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
@@ -493,7 +491,7 @@ static int __init teql_init(void)
master = netdev_priv(dev);
- strlcpy(master->qops.id, dev->name, IFNAMSIZ);
+ strscpy(master->qops.id, dev->name, IFNAMSIZ);
err = register_qdisc(&master->qops);
if (err) {
@@ -525,3 +523,4 @@ module_init(teql_init);
module_exit(teql_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("True (or trivial) link equalizer qdisc");