Diffstat (limited to 'net/sched/sch_cake.c')
-rw-r--r--   net/sched/sch_cake.c | 768
1 file changed, 443 insertions, 325 deletions
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 73940293700d..4a64d6397b6f 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -65,6 +65,7 @@ #include <linux/reciprocal_div.h> #include <net/netlink.h> #include <linux/if_vlan.h> +#include <net/gso.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tcp.h> @@ -138,8 +139,8 @@ struct cake_flow { struct cake_host { u32 srchost_tag; u32 dsthost_tag; - u16 srchost_refcnt; - u16 dsthost_refcnt; + u16 srchost_bulk_flow_count; + u16 dsthost_bulk_flow_count; }; struct cake_heap_entry { @@ -173,8 +174,7 @@ struct cake_tin_data { u64 tin_rate_bps; u16 tin_rate_shft; - u16 tin_quantum_prio; - u16 tin_quantum_band; + u16 tin_quantum; s32 tin_deficit; u32 tin_backlog; u32 tin_dropped; @@ -211,6 +211,9 @@ struct cake_sched_data { u8 ack_filter; u8 atm_mode; + u32 fwmark_mask; + u16 fwmark_shft; + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ u16 rate_shft; ktime_t time_next_packet; @@ -310,8 +313,8 @@ static const u8 precedence[] = { }; static const u8 diffserv8[] = { - 2, 5, 1, 2, 4, 2, 2, 2, - 0, 2, 1, 2, 1, 2, 1, 2, + 2, 0, 1, 2, 4, 2, 2, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 5, 2, 4, 2, 4, 2, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 6, 2, 3, 2, 3, 2, 3, 2, @@ -321,7 +324,7 @@ static const u8 diffserv8[] = { }; static const u8 diffserv4[] = { - 0, 2, 0, 0, 2, 0, 0, 0, + 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, @@ -332,7 +335,7 @@ static const u8 diffserv4[] = { }; static const u8 diffserv3[] = { - 0, 0, 0, 0, 2, 0, 0, 0, + 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -358,8 +361,24 @@ static const u8 besteffort[] = { static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; static const u8 bulk_order[] = {1, 0, 2, 3}; +/* There is a big difference in timing between the accurate values placed in the + * cache and the approximations given by a single Newton step for small count + * values, particularly when stepping from count 1 to 2 or vice versa. Hence, + * these values are calculated using eight Newton steps, using the + * implementation below. Above 16, a single Newton step gives sufficient + * accuracy in either direction, given the precision stored. + * + * The magnitude of the error when stepping up to count 2 is such as to give the + * value that *should* have been produced at count 4. + */ + #define REC_INV_SQRT_CACHE (16) -static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; +static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { + ~0, ~0, 3037000500, 2479700525, + 2147483647, 1920767767, 1753413056, 1623345051, + 1518500250, 1431655765, 1358187914, 1294981364, + 1239850263, 1191209601, 1147878294, 1108955788 +}; /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) @@ -385,47 +404,14 @@ static void cobalt_newton_step(struct cobalt_vars *vars) static void cobalt_invsqrt(struct cobalt_vars *vars) { if (vars->count < REC_INV_SQRT_CACHE) - vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; + vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; else cobalt_newton_step(vars); } -/* There is a big difference in timing between the accurate values placed in - * the cache and the approximations given by a single Newton step for small - * count values, particularly when stepping from count 1 to 2 or vice versa. - * Above 16, a single Newton step gives sufficient accuracy in either - * direction, given the precision stored. 
- * - * The magnitude of the error when stepping up to count 2 is such as to give - * the value that *should* have been produced at count 4. - */ - -static void cobalt_cache_init(void) -{ - struct cobalt_vars v; - - memset(&v, 0, sizeof(v)); - v.rec_inv_sqrt = ~0U; - cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; - - for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { - cobalt_newton_step(&v); - cobalt_newton_step(&v); - cobalt_newton_step(&v); - cobalt_newton_step(&v); - - cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; - } -} - static void cobalt_vars_init(struct cobalt_vars *vars) { memset(vars, 0, sizeof(*vars)); - - if (!cobalt_rec_inv_sqrt_cache[0]) { - cobalt_cache_init(); - cobalt_rec_inv_sqrt_cache[0] = ~0; - } } /* CoDel control_law is t + interval/sqrt(count) @@ -498,13 +484,14 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, /* Call this with a freshly dequeued packet for possible congestion marking. * Returns true as an instruction to drop the packet, false for delivery. */ -static bool cobalt_should_drop(struct cobalt_vars *vars, - struct cobalt_params *p, - ktime_t now, - struct sk_buff *skb, - u32 bulk_flows) -{ - bool next_due, over_target, drop = false; +static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb, + u32 bulk_flows) +{ + enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; + bool next_due, over_target; ktime_t schedule; u64 sojourn; @@ -547,7 +534,8 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, if (next_due && vars->dropping) { /* Use ECN mark if possible, otherwise drop */ - drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) + reason = SKB_DROP_REASON_QDISC_CONGESTED; vars->count++; if (!vars->count) @@ -570,38 +558,61 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, } /* Simple BLUE implementation. Lack of ECN is deliberate. */ - if (vars->p_drop) - drop |= (prandom_u32() < vars->p_drop); + if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && + get_random_u32() < vars->p_drop) + reason = SKB_DROP_REASON_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); - else if (ktime_to_ns(schedule) > 0 && !drop) + else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) vars->drop_next = now; - return drop; + return reason; } -static void cake_update_flowkeys(struct flow_keys *keys, +static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; - bool rev = !skb->_nfct; + bool rev = !skb->_nfct, upd = false; + __be32 ip; - if (tc_skb_protocol(skb) != htons(ETH_P_IP)) - return; + if (skb_protocol(skb, true) != htons(ETH_P_IP)) + return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) - return; + return false; - keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; - keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + if (ip != keys->addrs.v4addrs.src) { + keys->addrs.v4addrs.src = ip; + upd = true; + } + ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + if (ip != keys->addrs.v4addrs.dst) { + keys->addrs.v4addrs.dst = ip; + upd = true; + } if (keys->ports.ports) { - keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; - keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + __be16 port; + + port = rev ? 
tuple.dst.u.all : tuple.src.u.all; + if (port != keys->ports.src) { + keys->ports.src = port; + upd = true; + } + port = rev ? tuple.src.u.all : tuple.dst.u.all; + if (port != keys->ports.dst) { + port = keys->ports.dst; + upd = true; + } } + return upd; +#else + return false; #endif } @@ -619,26 +630,96 @@ static bool cake_ddst(int flow_mode) return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } +static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count)) + q->hosts[flow->srchost].srchost_bulk_flow_count--; +} + +static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->srchost].srchost_bulk_flow_count++; +} + +static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count--; +} + +static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count++; +} + +static u16 cake_get_flow_quantum(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + u16 host_load = 1; + + if (cake_dsrc(flow_mode)) + host_load = max(host_load, + q->hosts[flow->srchost].srchost_bulk_flow_count); + + if (cake_ddst(flow_mode)) + host_load = max(host_load, + q->hosts[flow->dsthost].dsthost_bulk_flow_count); + + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors + */ + return (q->flow_quantum * quantum_div[host_load] + + get_random_u16()) >> 16; +} + static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { + bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); + bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); + bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; + bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; - /* If both overrides are set we can skip packet dissection entirely */ - if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && - (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) + /* If both overrides are set, or we can use the SKB hash and nat mode is + * disabled, we can skip packet dissection entirely. If nat mode is + * enabled there's another check below after doing the conntrack lookup. 
+ */ + if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - if (flow_mode & CAKE_FLOW_NAT_FLAG) - cake_update_flowkeys(&keys, skb); + /* Don't use the SKB hash if we change the lookup keys from conntrack */ + if (nat_enabled && cake_update_flowkeys(&keys, skb)) + use_skbhash = false; + + /* If we can still use the SKB hash and don't need the host hash, we can + * skip the rest of the hashing procedure + */ + if (use_skbhash && !hash_hosts) + goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat @@ -677,12 +758,14 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ - if (flow_mode & CAKE_FLOW_FLOWS) + if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; + else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) + flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; @@ -746,10 +829,13 @@ skip_hash: * queue, accept the collision, update the host tags. */ q->way_collisions++; - q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--; - q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--; allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); + + if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { + cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + } found: /* reserve queue for future packets in same flow */ reduced_hash = outer_hash + k; @@ -767,14 +853,16 @@ found: } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { - if (!q->hosts[outer_hash + k].srchost_refcnt) + if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) break; } q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; - q->hosts[srchost_idx].srchost_refcnt++; q->flows[reduced_hash].srchost = srchost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { @@ -789,14 +877,16 @@ found_src: } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { - if (!q->hosts[outer_hash + k].dsthost_refcnt) + if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) break; } q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - q->hosts[dsthost_idx].dsthost_refcnt++; q->flows[reduced_hash].dsthost = dsthost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } @@ -900,7 +990,7 @@ static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, } tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (!tcph) + if (!tcph || tcph->doff < 5) return NULL; return skb_header_pointer(skb, offset, @@ -924,6 +1014,8 @@ static const void *cake_get_tcpopt(const struct tcphdr *tcph, length--; continue; } + if (length < 2) + break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; @@ -1061,6 +1153,8 @@ static bool cake_tcph_may_drop(const struct tcphdr *tcph, length--; continue; } + if (length < 2) + break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; 
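The two hunks above add a "length < 2" guard to the TCP option walks in cake_get_tcpopt() and cake_tcph_may_drop(), so a lone trailing option kind can no longer push the read of the option-length byte past the end of the option area. A minimal userspace sketch of the same defensive walk over a plain byte buffer (find_tcp_option() and the sample buffer are illustrative only, not the kernel interface):

#include <stdint.h>
#include <stdio.h>

#define TCPOPT_EOL 0
#define TCPOPT_NOP 1

/* Walk TCP options the way cake_get_tcpopt() does, refusing to read a
 * length byte that is not actually there.
 */
static const uint8_t *find_tcp_option(const uint8_t *opts, int length, uint8_t code)
{
	const uint8_t *ptr = opts;

	while (length > 0) {
		uint8_t opcode = *ptr++;
		uint8_t opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		if (length < 2)		/* no room left for the length byte */
			break;
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;		/* malformed, or would overrun the option area */
		if (opcode == code)
			return ptr;	/* points at the option payload */
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}

int main(void)
{
	/* MSS option (kind 2, len 4, value 1460) followed by a truncated option header */
	const uint8_t opts[] = { 2, 4, 0x05, 0xb4, 8 };

	printf("MSS present:    %s\n", find_tcp_option(opts, sizeof(opts), 2) ? "yes" : "no");
	printf("kind 8 present: %s\n", find_tcp_option(opts, sizeof(opts), 8) ? "yes" : "no");
	return 0;
}

Without the "length < 2" check, the second lookup would dereference one byte past the end of the buffer when it hits the truncated option header.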
@@ -1162,7 +1256,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, iph_check->daddr != iph->daddr) continue; - seglen = ntohs(iph_check->tot_len) - + seglen = iph_totlen(skb, iph_check) - (4 * iph_check->ihl); } else if (iph_check->version == 6) { ipv6h = (struct ipv6hdr *)iph; @@ -1304,16 +1398,19 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); + u16 segs = qdisc_pkt_segs(skb); u32 len = qdisc_pkt_len(skb); - u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); - if (!shinfo->gso_size) + if (segs == 1) return cake_calc_overhead(q, len, off); - /* borrowed from qdisc_pkt_len_init() */ - hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + /* borrowed from qdisc_pkt_len_segs_init() */ + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | @@ -1321,24 +1418,18 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) const struct tcphdr *th; struct tcphdr _tcphdr; - th = skb_header_pointer(skb, skb_transport_offset(skb), + th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else { struct udphdr _udphdr; - if (skb_header_pointer(skb, skb_transport_offset(skb), + if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } - if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) - segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); - else - segs = shinfo->gso_segs; - len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); @@ -1464,7 +1555,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) if (!q->overflow_timeout) { int i; /* Build fresh max-heap */ - for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) cake_heapify(q, i); } q->overflow_timeout = 65535; @@ -1491,16 +1582,14 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) b->backlogs[idx] -= len; b->tin_backlog -= len; sch->qstats.backlog -= len; - qdisc_tree_reduce_backlog(sch, 1, len); flow->dropped++; b->tin_dropped++; - sch->qstats.drops++; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); - __qdisc_drop(skb, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; cake_heapify(q, 0); @@ -1508,35 +1597,51 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) return idx + (tin << 16); } -static void cake_wash_diffserv(struct sk_buff *skb) -{ - switch (skb->protocol) { - case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); - break; - case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); - break; - default: - break; - } -} - -static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) +static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) { + const int offset = skb_network_offset(skb); + u16 *buf, buf_; u8 dscp; - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): - dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; - if (wash && dscp) + buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); + if (unlikely(!buf)) + return 0; + + /* ToS is in the 
second byte of iphdr */ + dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; + + if (wash && dscp) { + const int wlen = offset + sizeof(struct iphdr); + + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + } + return dscp; case htons(ETH_P_IPV6): - dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; - if (wash && dscp) + buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); + if (unlikely(!buf)) + return 0; + + /* Traffic class is in the first and second bytes of ipv6hdr */ + dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; + + if (wash && dscp) { + const int wlen = offset + sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + } + return dscp; case htons(ETH_P_ARP): @@ -1552,26 +1657,37 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); - u32 tin; + u32 tin, mark; + bool wash; + u8 dscp; - if (TC_H_MAJ(skb->priority) == sch->handle && - TC_H_MIN(skb->priority) > 0 && - TC_H_MIN(skb->priority) <= q->tin_cnt) { + /* Tin selection: Default to diffserv-based selection, allow overriding + * using firewall marks or skb->priority. Call DSCP parsing early if + * wash is enabled, otherwise defer to below to skip unneeded parsing. + */ + mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; + wash = !!(q->rate_flags & CAKE_FLAG_WASH); + if (wash) + dscp = cake_handle_diffserv(skb, wash); + + if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) + tin = 0; + + else if (mark && mark <= q->tin_cnt) + tin = q->tin_order[mark - 1]; + + else if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->tin_cnt) tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; - if (q->rate_flags & CAKE_FLAG_WASH) - cake_wash_diffserv(skb); - } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) { - /* extract the Diffserv Precedence field, if it exists */ - /* and clear DSCP bits if washing */ - tin = q->tin_index[cake_handle_diffserv(skb, - q->rate_flags & CAKE_FLAG_WASH)]; + else { + if (!wash) + dscp = cake_handle_diffserv(skb, wash); + tin = q->tin_index[dscp]; + if (unlikely(tin >= q->tin_cnt)) tin = 0; - } else { - tin = 0; - if (q->rate_flags & CAKE_FLAG_WASH) - cake_wash_diffserv(skb); } return &q->tins[tin]; @@ -1591,7 +1707,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT @@ -1600,7 +1716,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - /* fall through */ + fallthrough; case TC_ACT_SHOT: return 0; } @@ -1620,14 +1736,14 @@ static void cake_reconfigure(struct Qdisc *sch); static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + u32 idx, tin, prev_qlen, prev_backlog, drop_id; struct cake_sched_data *q = qdisc_priv(sch); - int len = qdisc_pkt_len(skb); - int uninitialized_var(ret); + int len = qdisc_pkt_len(skb), ret; struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx; + bool same_flow = false; /* choose flow to insert into */ idx = cake_classify(sch, &b, 
skb, q->flow_mode, &ret); @@ -1637,6 +1753,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; } + tin = (u32)(b - q->tins); idx--; flow = &b->flows[idx]; @@ -1664,7 +1781,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; @@ -1673,10 +1790,10 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); @@ -1687,7 +1804,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, slen += segs->len; q->buffer_used += segs->truesize; b->packets++; - segs = nskb; } /* stats */ @@ -1701,6 +1817,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, consume_skb(skb); } else { /* not splitting */ + int ack_pkt_len = 0; + cobalt_set_enqueue_time(skb, now); get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); @@ -1711,13 +1829,13 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (ack) { b->ack_drops++; sch->qstats.drops++; - b->bytes += qdisc_pkt_len(ack); - len -= qdisc_pkt_len(ack); + ack_pkt_len = qdisc_pkt_len(ack); + b->bytes += ack_pkt_len; q->buffer_used += skb->truesize - ack->truesize; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, ack, now, true); - qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len); consume_skb(ack); } else { sch->q.qlen++; @@ -1726,11 +1844,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* stats */ b->packets++; - b->bytes += len; - b->backlogs[idx] += len; - b->tin_backlog += len; - sch->qstats.backlog += len; - q->avg_window_bytes += len; + b->bytes += len - ack_pkt_len; + b->backlogs[idx] += len - ack_pkt_len; + b->tin_backlog += len - ack_pkt_len; + sch->qstats.backlog += len - ack_pkt_len; + q->avg_window_bytes += len - ack_pkt_len; } if (q->overflow_timeout) @@ -1759,7 +1877,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->avg_window_begin)); u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; - do_div(b, window_interval); + b = div64_u64(b, window_interval); q->avg_peak_bandwidth = cake_ewma(q->avg_peak_bandwidth, b, b > q->avg_peak_bandwidth ? 
2 : 8); @@ -1780,10 +1898,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - u16 host_load = 1; - if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { @@ -1793,14 +1907,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_refcnt); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_refcnt); - - flow->deficit = (b->flow_quantum * - quantum_div[host_load]) >> 16; + flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. @@ -1808,20 +1915,37 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_BULK; b->sparse_flow_count--; b->bulk_flow_count++; + + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) q->buffer_max_used = q->buffer_used; - if (q->buffer_used > q->buffer_limit) { - u32 dropped = 0; + if (q->buffer_used <= q->buffer_limit) + return NET_XMIT_SUCCESS; - while (q->buffer_used > q->buffer_limit) { - dropped++; - cake_drop(sch, to_free); - } - b->drop_overlimit += dropped; + prev_qlen = sch->q.qlen; + prev_backlog = sch->qstats.backlog; + + while (q->buffer_used > q->buffer_limit) { + drop_id = cake_drop(sch, to_free); + if ((drop_id >> 16) == tin && + (drop_id & 0xFFFF) == idx) + same_flow = true; + } + + prev_qlen -= sch->q.qlen; + prev_backlog -= sch->qstats.backlog; + b->drop_overlimit += prev_qlen; + + if (same_flow) { + qdisc_tree_reduce_backlog(sch, prev_qlen - 1, + prev_backlog - len); + return NET_XMIT_CN; } + qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog); return NET_XMIT_SUCCESS; } @@ -1857,20 +1981,19 @@ static void cake_clear_tin(struct Qdisc *sch, u16 tin) q->cur_tin = tin; for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) while (!!(skb = cake_dequeue_one(sch))) - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); } static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - struct cake_host *srchost, *dsthost; + enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; - u16 host_load; u64 delay; u32 len; @@ -1899,7 +2022,7 @@ begin: while (b->tin_deficit < 0 || !(b->sparse_flow_count + b->bulk_flow_count)) { if (b->tin_deficit <= 0) - b->tin_deficit += b->tin_quantum_band; + b->tin_deficit += b->tin_quantum; if (b->sparse_flow_count + b->bulk_flow_count) empty = false; @@ -1970,28 +2093,8 @@ retry: q->cur_flow = flow - b->flows; first_flow = false; - /* triple isolation (modified DRR++) */ - srchost = &b->hosts[flow->srchost]; - dsthost = &b->hosts[flow->dsthost]; - host_load = 1; - - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_refcnt); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_refcnt); - - WARN_ON(host_load > CAKE_QUEUES); - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { - /* The shifted prandom_u32() is a 
way to apply dithering to - * avoid accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - (prandom_u32() >> 16)) >> 16; - list_move_tail(&flow->flowchain, &b->old_flows); - /* Keep all flows with deficits out of the sparse and decaying * rotations. No non-empty flow can go into the decaying * rotation, so they can't get deficits @@ -2000,6 +2103,10 @@ retry: if (flow->head) { b->sparse_flow_count--; b->bulk_flow_count++; + + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); + flow->set = CAKE_SET_BULK; } else { /* we've moved it to the bulk rotation for @@ -2009,6 +2116,10 @@ retry: flow->set = CAKE_SET_SPARSE_WAIT; } } + + flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); + list_move_tail(&flow->flowchain, &b->old_flows); + goto retry; } @@ -2029,6 +2140,10 @@ retry: &b->decaying_flows); if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; + + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); + b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) { @@ -2042,24 +2157,25 @@ retry: if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) b->sparse_flow_count--; - else if (flow->set == CAKE_SET_BULK) + else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - else + + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); + } else b->decaying_flow_count--; flow->set = CAKE_SET_NONE; - srchost->srchost_refcnt--; - dsthost->dsthost_refcnt--; } goto begin; } + reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ - if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, - (b->bulk_flow_count * - !!(q->rate_flags & - CAKE_FLAG_INGRESS))) || - !flow->head) + if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ @@ -2073,7 +2189,7 @@ retry: b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb(skb); + qdisc_dequeue_drop(sch, skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } @@ -2122,8 +2238,12 @@ retry: static void cake_reset(struct Qdisc *sch) { + struct cake_sched_data *q = qdisc_priv(sch); u32 c; + if (!q->tins) + return; + for (c = 0; c < CAKE_MAX_TINS; c++) cake_clear_tin(sch, c); } @@ -2144,6 +2264,8 @@ static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { [TCA_CAKE_MPU] = { .type = NLA_U32 }, [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, + [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, + [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, }; static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, @@ -2199,8 +2321,7 @@ static int cake_config_besteffort(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_band = 65535; - b->tin_quantum_prio = 65535; + b->tin_quantum = 65535; return 0; } @@ -2211,8 +2332,7 @@ static int cake_config_precedence(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2225,18 +2345,14 @@ static int 
cake_config_precedence(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2244,9 +2360,7 @@ static int cake_config_precedence(struct Qdisc *sch) /* List of known Diffserv codepoints: * - * Least Effort (CS1) - * Best Effort (CS0) - * Max Reliability & LLT "Lo" (TOS1) + * Default Forwarding (DF/CS0) - Best Effort * Max Throughput (TOS2) * Min Delay (TOS4) * LLT "La" (TOS5) @@ -2254,6 +2368,7 @@ static int cake_config_precedence(struct Qdisc *sch) * Assured Forwarding 2 (AF2x) - x3 * Assured Forwarding 3 (AF3x) - x3 * Assured Forwarding 4 (AF4x) - x3 + * Precedence Class 1 (CS1) * Precedence Class 2 (CS2) * Precedence Class 3 (CS3) * Precedence Class 4 (CS4) @@ -2262,11 +2377,12 @@ static int cake_config_precedence(struct Qdisc *sch) * Precedence Class 7 (CS7) * Voice Admit (VA) * Expedited Forwarding (EF) - - * Total 25 codepoints. + * Lower Effort (LE) + * + * Total 26 codepoints. */ -/* List of traffic classes in RFC 4594: +/* List of traffic classes in RFC 4594, updated by RFC 8622: * (roughly descending order of contended priority) * (roughly ascending order of uncontended throughput) * @@ -2277,12 +2393,12 @@ static int cake_config_precedence(struct Qdisc *sch) * Realtime Interactive (CS4) - eg. games * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch * Broadcast Video (CS3) - * Low Latency Data (AF2x,TOS4) - eg. database - * Ops, Admin, Management (CS2,TOS1) - eg. ssh - * Standard Service (CS0 & unrecognised codepoints) - * High Throughput Data (AF1x,TOS2) - eg. web traffic - * Low Priority Data (CS1) - eg. BitTorrent - + * Low-Latency Data (AF2x,TOS4) - eg. database + * Ops, Admin, Management (CS2) - eg. ssh + * Standard Service (DF & unrecognised codepoints) + * High-Throughput Data (AF1x,TOS2) - eg. web traffic + * Low-Priority Data (LE,CS1) - eg. BitTorrent + * * Total 12 traffic classes. */ @@ -2292,12 +2408,12 @@ static int cake_config_diffserv8(struct Qdisc *sch) * * Network Control (CS6, CS7) * Minimum Latency (EF, VA, CS5, CS4) - * Interactive Shell (CS2, TOS1) + * Interactive Shell (CS2) * Low Latency Transactions (AF2x, TOS4) * Video Streaming (AF4x, AF3x, CS3) - * Bog Standard (CS0 etc.) - * High Throughput (AF1x, TOS2) - * Background Traffic (CS1) + * Bog Standard (DF etc.) + * High Throughput (AF1x, TOS2, CS1) + * Background Traffic (LE) * * Total 8 traffic classes. 
*/ @@ -2305,8 +2421,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2322,18 +2437,14 @@ static int cake_config_diffserv8(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2344,9 +2455,9 @@ static int cake_config_diffserv4(struct Qdisc *sch) /* Further pruned list of traffic classes for four-class system: * * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) - * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) - * Best Effort (CS0, AF1x, TOS2, and those not specified) - * Background Traffic (CS1) + * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) + * Best Effort (DF, AF1x, TOS2, and those not specified) + * Background Traffic (LE, CS1) * * Total 4 traffic classes. */ @@ -2372,17 +2483,11 @@ static int cake_config_diffserv4(struct Qdisc *sch) cake_set_rate(&q->tins[3], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 2; - q->tins[3].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 1; - q->tins[3].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 1; + q->tins[3].tin_quantum = quantum >> 2; return 0; } @@ -2390,9 +2495,9 @@ static int cake_config_diffserv4(struct Qdisc *sch) static int cake_config_diffserv3(struct Qdisc *sch) { /* Simplified Diffserv structure with 3 tins. 
- * Low Priority (CS1) + * Latency Sensitive (CS7, CS6, EF, VA, TOS4) * Best Effort - * Latency Sensitive (TOS4, VA, EF, CS6, CS7) + * Low Priority (LE, CS1) */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); @@ -2413,15 +2518,10 @@ static int cake_config_diffserv3(struct Qdisc *sch) cake_set_rate(&q->tins[2], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 2; return 0; } @@ -2485,19 +2585,20 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; + u16 rate_flags; + u8 flow_mode; int err; - if (!opt) - return -EINVAL; - - err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, + extack); if (err < 0) return err; + flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) - q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; - q->flow_mode |= CAKE_FLOW_NAT_FLAG * + flow_mode &= ~CAKE_FLOW_NAT_FLAG; + flow_mode |= CAKE_FLOW_NAT_FLAG * !!nla_get_u32(tb[TCA_CAKE_NAT]); #else NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], @@ -2507,29 +2608,34 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_BASE_RATE64]) - q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + WRITE_ONCE(q->rate_bps, + nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); if (tb[TCA_CAKE_DIFFSERV_MODE]) - q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + WRITE_ONCE(q->tin_mode, + nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); + rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) - q->rate_flags |= CAKE_FLAG_WASH; + rate_flags |= CAKE_FLAG_WASH; else - q->rate_flags &= ~CAKE_FLAG_WASH; + rate_flags &= ~CAKE_FLAG_WASH; } if (tb[TCA_CAKE_FLOW_MODE]) - q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); if (tb[TCA_CAKE_ATM]) - q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + WRITE_ONCE(q->atm_mode, + nla_get_u32(tb[TCA_CAKE_ATM])); if (tb[TCA_CAKE_OVERHEAD]) { - q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); - q->rate_flags |= CAKE_FLAG_OVERHEAD; + WRITE_ONCE(q->rate_overhead, + nla_get_s32(tb[TCA_CAKE_OVERHEAD])); + rate_flags |= CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2538,7 +2644,7 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_RAW]) { - q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + rate_flags &= ~CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2547,49 +2653,58 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_MPU]) - q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + WRITE_ONCE(q->rate_mpu, + nla_get_u32(tb[TCA_CAKE_MPU])); if (tb[TCA_CAKE_RTT]) { - q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); - if (!q->interval) - q->interval = 1; + WRITE_ONCE(q->interval, max(interval, 1U)); } if (tb[TCA_CAKE_TARGET]) { - q->target = 
nla_get_u32(tb[TCA_CAKE_TARGET]); + u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); - if (!q->target) - q->target = 1; + WRITE_ONCE(q->target, max(target, 1U)); } if (tb[TCA_CAKE_AUTORATE]) { if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) - q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; } if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) - q->rate_flags |= CAKE_FLAG_INGRESS; + rate_flags |= CAKE_FLAG_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_INGRESS; + rate_flags &= ~CAKE_FLAG_INGRESS; } if (tb[TCA_CAKE_ACK_FILTER]) - q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + WRITE_ONCE(q->ack_filter, + nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); if (tb[TCA_CAKE_MEMORY]) - q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + WRITE_ONCE(q->buffer_config_limit, + nla_get_u32(tb[TCA_CAKE_MEMORY])); if (tb[TCA_CAKE_SPLIT_GSO]) { if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) - q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + rate_flags |= CAKE_FLAG_SPLIT_GSO; else - q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + } + + if (tb[TCA_CAKE_FWMARK]) { + WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); + WRITE_ONCE(q->fwmark_shft, + q->fwmark_mask ? __ffs(q->fwmark_mask) : 0); } + WRITE_ONCE(q->rate_flags, rate_flags); + WRITE_ONCE(q->flow_mode, flow_mode); if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2615,6 +2730,8 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, int i, j, err; sch->limit = 10240; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; @@ -2631,7 +2748,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); if (opt) { - int err = cake_change(sch, opt, extack); + err = cake_change(sch, opt, extack); if (err) return err; @@ -2648,7 +2765,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), GFP_KERNEL); if (!q->tins) - goto nomem; + return -ENOMEM; for (i = 0; i < CAKE_MAX_TINS; i++) { struct cake_tin_data *b = q->tins + i; @@ -2678,75 +2795,78 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, q->min_netlen = ~0; q->min_adjlen = ~0; return 0; - -nomem: - cake_destroy(sch); - return -ENOMEM; } static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; + u16 rate_flags; + u8 flow_mode; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; - if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, - TCA_CAKE_PAD)) + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, + READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, - q->flow_mode & CAKE_FLOW_MASK)) + flow_mode = READ_ONCE(q->flow_mode); + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + if (nla_put_u32(skb, TCA_CAKE_MEMORY, + READ_ONCE(q->buffer_config_limit))) 
goto nla_put_failure; + rate_flags = READ_ONCE(q->rate_flags); if (nla_put_u32(skb, TCA_CAKE_AUTORATE, - !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_INGRESS, - !!(q->rate_flags & CAKE_FLAG_INGRESS))) + !!(rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_NAT, - !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + !!(flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_WASH, - !!(q->rate_flags & CAKE_FLAG_WASH))) + !!(rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) + if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) goto nla_put_failure; - if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (!(rate_flags & CAKE_FLAG_OVERHEAD)) if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, - !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -2757,7 +2877,7 @@ nla_put_failure: static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { - struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); + struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tstats, *ts; int i; @@ -2787,7 +2907,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) #undef PUT_STAT_U32 #undef PUT_STAT_U64 - tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); + tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); if (!tstats) goto nla_put_failure; @@ -2804,7 +2924,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[q->tin_order[i]]; - ts = nla_nest_start(d->skb, i + 1); + ts = nla_nest_start_noflag(d->skb, i + 1); if (!ts) goto nla_put_failure; @@ -2924,7 +3044,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (flow) { ktime_t now = ktime_get(); - stats = nla_nest_start(d->skb, TCA_STATS_APP); + stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); if (!stats) return -1; @@ -2945,7 +3065,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, - flow->cvars.blue_timer))); + flow->cvars.blue_timer))); } if (flow->cvars.dropping) { PUT_STAT_S32(DROP_NEXT_US, @@ -2977,16 +3097,13 @@ static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) struct cake_tin_data *b = &q->tins[q->tin_order[i]]; for (j = 0; j < CAKE_QUEUES; j++) { - if (list_empty(&b->flows[j].flowchain) || - arg->count < arg->skip) { + if (list_empty(&b->flows[j].flowchain)) { arg->count++; continue; } - if 
(arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, + arg)) break; - } - arg->count++; } } } @@ -3017,6 +3134,7 @@ static struct Qdisc_ops cake_qdisc_ops __read_mostly = { .dump_stats = cake_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("cake"); static int __init cake_module_init(void) {
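The static inv_sqrt_cache[] table added near the top of this diff replaces the runtime cobalt_cache_init(); each entry is 1/sqrt(count) in Q0.32 fixed point, computed per the new comment with eight Newton steps. A userspace sketch of how such a table can be regenerated or checked — the step follows the CoDel/COBALT recurrence new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2); exact last-bit agreement with the kernel table depends on matching its rounding:

#include <stdint.h>
#include <stdio.h>

#define REC_INV_SQRT_CACHE 16

/* One Q0.32 fixed-point Newton step toward 1/sqrt(count). */
static uint32_t newton_step(uint32_t invsqrt, uint32_t count)
{
	uint64_t invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;
	uint64_t val = (3ULL << 32) - (uint64_t)count * invsqrt2;

	val >>= 2;				/* avoid overflow in the multiply below */
	val = (val * invsqrt) >> (32 - 2 + 1);

	return (uint32_t)val;
}

int main(void)
{
	uint32_t rec_inv_sqrt = ~0U;		/* counts 0 and 1 map to ~0, i.e. ~1.0 */
	uint32_t cache[REC_INV_SQRT_CACHE];
	int count, i;

	cache[0] = cache[1] = ~0U;
	for (count = 2; count < REC_INV_SQRT_CACHE; count++) {
		/* carry the previous count's value forward, as the removed
		 * cobalt_cache_init() did, and refine with eight steps
		 */
		for (i = 0; i < 8; i++)
			rec_inv_sqrt = newton_step(rec_inv_sqrt, (uint32_t)count);
		cache[count] = rec_inv_sqrt;
	}

	for (count = 0; count < REC_INV_SQRT_CACHE; count++)
		printf("%2d: %u\n", count, (unsigned int)cache[count]);
	return 0;
}

For example, entry 2 comes out at roughly 2^32 / sqrt(2) ≈ 3037000500 and entry 4 at roughly 2^32 / 2, matching the values hard-coded in the patch.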
