summaryrefslogtreecommitdiff
path: root/samples/bpf/hbm_out_kern.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-03-05 08:26:13 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-05 08:26:13 -0800
commit6456300356433873309a1cae6aa05e77d6b59153 (patch)
tree3158f04f2ca63a48e4d3021aba31aee8f18221cf /samples/bpf/hbm_out_kern.c
parentcd2a3bf02625ffad02a6b9f7df758ee36cf12769 (diff)
parent18a4d8bf250a33c015955f0dec27259780ef6448 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Here we go, another merge window full of networking and #ebpf changes: 1) Snoop DHCPACKS in batman-adv to learn MAC/IP pairs in the DHCP range without dealing with floods of ARP traffic, from Linus Lüssing. 2) Throttle buffered multicast packet transmission in mt76, from Felix Fietkau. 3) Support adaptive interrupt moderation in ice, from Brett Creeley. 4) A lot of struct_size conversions, from Gustavo A. R. Silva. 5) Add peek/push/pop commands to bpftool, as well as bash completion, from Stanislav Fomichev. 6) Optimize sk_msg_clone(), from Vakul Garg. 7) Add SO_BINDTOIFINDEX, from David Herrmann. 8) Be more conservative with local resends due to local congestion, from Yuchung Cheng. 9) Allow vetoing of unsupported VXLAN FDBs, from Petr Machata. 10) Add health buffer support to devlink, from Eran Ben Elisha. 11) Add TXQ scheduling API to mac80211, from Toke Høiland-Jørgensen. 12) Add statistics to basic packet scheduler filter, from Cong Wang. 13) Add GRE tunnel support for mlxsw Spectrum-2, from Nir Dotan. 14) Lots of new IP tunneling forwarding tests, also from Nir Dotan. 15) Add 3ad stats to bonding, from Nikolay Aleksandrov. 16) Lots of probing improvements for bpftool, from Quentin Monnet. 17) Various nfp drive #ebpf JIT improvements from Jakub Kicinski. 18) Allow #ebpf programs to access gso_segs from skb shared info, from Eric Dumazet. 19) Add sock_diag support for AF_XDP sockets, from Björn Töpel. 20) Support 22260 iwlwifi devices, from Luca Coelho. 21) Use rbtree for ipv6 defragmentation, from Peter Oskolkov. 22) Add JMP32 instruction class support to #ebpf, from Jiong Wang. 23) Add spinlock support to #ebpf, from Alexei Starovoitov. 24) Support 256-bit keys and TLS 1.3 in ktls, from Dave Watson. 25) Add device infomation API to devlink, from Jakub Kicinski. 26) Add new timestamping socket options which are y2038 safe, from Deepa Dinamani. 27) Add RX checksum offloading for various sh_eth chips, from Sergei Shtylyov. 28) Flow offload infrastructure, from Pablo Neira Ayuso. 29) Numerous cleanups, improvements, and bug fixes to the PHY layer and many drivers from Heiner Kallweit. 30) Lots of changes to try and make packet scheduler classifiers run lockless as much as possible, from Vlad Buslov. 31) Support BCM957504 chip in bnxt_en driver, from Erik Burrows. 32) Add concurrency tests to tc-tests infrastructure, from Vlad Buslov. 33) Add hwmon support to aquantia, from Heiner Kallweit. 34) Allow 64-bit values for SO_MAX_PACING_RATE, from Eric Dumazet. And I would be remiss if I didn't thank the various major networking subsystem maintainers for integrating much of this work before I even saw it. Alexei Starovoitov, Daniel Borkmann, Pablo Neira Ayuso, Johannes Berg, Kalle Valo, and many others. Thank you!" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2207 commits) net/sched: avoid unused-label warning net: ignore sysctl_devconf_inherit_init_net without SYSCTL phy: mdio-mux: fix Kconfig dependencies net: phy: use phy_modify_mmd_changed in genphy_c45_an_config_aneg net: dsa: mv88e6xxx: add call to mv88e6xxx_ports_cmode_init to probe for new DSA framework selftest/net: Remove duplicate header sky2: Disable MSI on Dell Inspiron 1545 and Gateway P-79 net/mlx5e: Update tx reporter status in case channels were successfully opened devlink: Add support for direct reporter health state update devlink: Update reporter state to error even if recover aborted sctp: call iov_iter_revert() after sending ABORT team: Free BPF filter when unregistering netdev ip6mr: Do not call __IP6_INC_STATS() from preemptible context isdn: mISDN: Fix potential NULL pointer dereference of kzalloc net: dsa: mv88e6xxx: support in-band signalling on SGMII ports with external PHYs cxgb4/chtls: Prefix adapter flags with CXGB4 net-sysfs: Switch to bitmap_zalloc() mellanox: Switch to bitmap_zalloc() bpf: add test cases for non-pointer sanitiation logic mlxsw: i2c: Extend initialization by querying resources data ...
Diffstat (limited to 'samples/bpf/hbm_out_kern.c')
-rw-r--r--samples/bpf/hbm_out_kern.c157
1 files changed, 157 insertions, 0 deletions
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
new file mode 100644
index 000000000000..f806863d0b79
--- /dev/null
+++ b/samples/bpf/hbm_out_kern.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it. However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ * CREDIT
+ * - <--------------------------|------------------------> +
+ * | | | 0
+ * | Large pkt |
+ * | drop thresh |
+ * Small pkt drop Mark threshold
+ * thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ * is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ * to reduce the congestion window. The current implementation uses a linear
+ * distribution (0% probability at marking threshold, 100% probability
+ * at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped by
+ * by a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (>= 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * the user program can access.
+ *
+ * A latter patch provides such a program (hbm.c)
+ */
+
+#include "hbm_kern.h"
+
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+ struct hbm_pkt_info pkti;
+ int len = skb->len;
+ unsigned int queue_index = 0;
+ unsigned long long curtime;
+ int credit;
+ signed long long delta = 0, zero = 0;
+ int max_credit = MAX_CREDIT;
+ bool congestion_flag = false;
+ bool drop_flag = false;
+ bool cwr_flag = false;
+ struct hbm_vqueue *qdp;
+ struct hbm_queue_stats *qsp = NULL;
+ int rv = ALLOW_PKT;
+
+ qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+ if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+ return ALLOW_PKT;
+
+ hbm_get_pkt_info(skb, &pkti);
+
+ // We may want to account for the length of headers in len
+ // calculation, like ETH header + overhead, specially if it
+ // is a gso packet. But I am not doing it right now.
+
+ qdp = bpf_get_local_storage(&queue_state, 0);
+ if (!qdp)
+ return ALLOW_PKT;
+ else if (qdp->lasttime == 0)
+ hbm_init_vqueue(qdp, 1024);
+
+ curtime = bpf_ktime_get_ns();
+
+ // Begin critical section
+ bpf_spin_lock(&qdp->lock);
+ credit = qdp->credit;
+ delta = curtime - qdp->lasttime;
+ /* delta < 0 implies that another process with a curtime greater
+ * than ours beat us to the critical section and already added
+ * the new credit, so we should not add it ourselves
+ */
+ if (delta > 0) {
+ qdp->lasttime = curtime;
+ credit += CREDIT_PER_NS(delta, qdp->rate);
+ if (credit > MAX_CREDIT)
+ credit = MAX_CREDIT;
+ }
+ credit -= len;
+ qdp->credit = credit;
+ bpf_spin_unlock(&qdp->lock);
+ // End critical section
+
+ // Check if we should update rate
+ if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
+ qdp->rate = qsp->rate * 128;
+ bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
+ (int)qdp->rate,
+ CREDIT_PER_NS(1000000000, qdp->rate) * 8);
+ }
+
+ // Set flags (drop, congestion, cwr)
+ // Dropping => we are congested, so ignore congestion flag
+ if (credit < -DROP_THRESH ||
+ (len > LARGE_PKT_THRESH &&
+ credit < -LARGE_PKT_DROP_THRESH)) {
+ // Very congested, set drop flag
+ drop_flag = true;
+ } else if (credit < 0) {
+ // Congested, set congestion flag
+ if (pkti.ecn) {
+ if (credit < -MARK_THRESH)
+ congestion_flag = true;
+ else
+ congestion_flag = false;
+ } else {
+ congestion_flag = true;
+ }
+ }
+
+ if (congestion_flag) {
+ if (!bpf_skb_ecn_set_ce(skb)) {
+ if (len > LARGE_PKT_THRESH) {
+ // Problem if too many small packets?
+ drop_flag = true;
+ }
+ }
+ }
+
+ if (drop_flag)
+ rv = DROP_PKT;
+
+ hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
+
+ if (rv == DROP_PKT)
+ __sync_add_and_fetch(&(qdp->credit), len);
+
+ return rv;
+}
+char _license[] SEC("license") = "GPL";