summaryrefslogtreecommitdiff
path: root/kernel/bpf/cpumap.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf/cpumap.c')
-rw-r--r--kernel/bpf/cpumap.c185
1 files changed, 104 insertions, 81 deletions
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a8e34416e960..67e8a2fc1a99 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -33,8 +33,8 @@
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
-#include <linux/netdevice.h> /* netif_receive_skb_list */
-#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/netdevice.h>
+#include <net/gro.h>
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
@@ -68,6 +68,7 @@ struct bpf_cpu_map_entry {
struct bpf_cpumap_val value;
struct bpf_prog *prog;
+ struct gro_node gro;
struct completion kthread_running;
struct rcu_work free_work;
@@ -79,8 +80,6 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
-static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -135,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
}
}
-static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
- struct list_head *listp,
- struct xdp_cpumap_stats *stats)
+static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ void **skbs, u32 skb_n,
+ struct xdp_cpumap_stats *stats)
{
- struct sk_buff *skb, *tmp;
struct xdp_buff xdp;
- u32 act;
+ u32 act, pass = 0;
int err;
- list_for_each_entry_safe(skb, tmp, listp, list) {
+ for (u32 i = 0; i < skb_n; i++) {
+ struct sk_buff *skb = skbs[i];
+
act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
switch (act) {
case XDP_PASS:
+ skbs[pass++] = skb;
break;
case XDP_REDIRECT:
- skb_list_del_init(skb);
err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
rcpu->prog);
if (unlikely(err)) {
@@ -159,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
} else {
stats->redirect++;
}
- return;
+ break;
default:
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
@@ -167,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
trace_xdp_exception(skb->dev, rcpu->prog, act);
fallthrough;
case XDP_DROP:
- skb_list_del_init(skb);
- kfree_skb(skb);
+ napi_consume_skb(skb, true);
stats->drop++;
- return;
+ break;
}
}
+
+ stats->pass += pass;
+
+ return pass;
}
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
@@ -192,7 +195,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
int err;
rxq.dev = xdpf->dev_rx;
- rxq.mem = xdpf->mem;
+ rxq.mem.type = xdpf->mem_type;
/* TODO: report queue_index to xdp_rxq_info */
xdp_convert_frame_to_buff(xdpf, &xdp);
@@ -206,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
stats->drop++;
} else {
frames[nframes++] = xdpf;
- stats->pass++;
}
break;
case XDP_REDIRECT:
@@ -230,40 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
xdp_clear_return_frame_no_direct();
+ stats->pass += nframes;
return nframes;
}
#define CPUMAP_BATCH 8
-static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
- int xdp_n, struct xdp_cpumap_stats *stats,
- struct list_head *list)
+struct cpu_map_ret {
+ u32 xdp_n;
+ u32 skb_n;
+};
+
+static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ void **skbs, struct cpu_map_ret *ret,
+ struct xdp_cpumap_stats *stats)
{
- int nframes;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
if (!rcpu->prog)
- return xdp_n;
+ goto out;
- rcu_read_lock_bh();
+ rcu_read_lock();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+ ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
+ if (unlikely(ret->skb_n))
+ ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n,
+ stats);
if (stats->redirect)
xdp_do_flush();
- if (unlikely(!list_empty(list)))
- cpu_map_bpf_prog_run_skb(rcpu, list, stats);
+ bpf_net_ctx_clear(bpf_net_ctx);
+ rcu_read_unlock();
- rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+out:
+ if (unlikely(ret->skb_n) && ret->xdp_n)
+ memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs));
+}
- return nframes;
+static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
+{
+ /*
+ * If the ring is not empty, there'll be a new iteration soon, and we
+ * only need to do a full flush if a tick is long (> 1 ms).
+ * If the ring is empty, to not hold GRO packets in the stack for too
+ * long, do a full flush.
+ * This is equivalent to how NAPI decides whether to perform a full
+ * flush.
+ */
+ gro_flush(&rcpu->gro, !empty && HZ >= 1000);
+ gro_normal_list(&rcpu->gro);
}
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
unsigned long last_qs = jiffies;
+ u32 packets = 0;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -276,11 +303,11 @@ static int cpu_map_kthread_run(void *data)
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
- gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- int i, n, m, nframes, xdp_n;
+ struct cpu_map_ret ret = { };
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- LIST_HEAD(list);
+ u32 i, n, m;
+ bool empty;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -305,7 +332,7 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
- for (i = 0, xdp_n = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
void *f = frames[i];
struct page *page;
@@ -313,11 +340,11 @@ static int cpu_map_kthread_run(void *data)
struct sk_buff *skb = f;
__ptr_clear_bit(0, &skb);
- list_add_tail(&skb->list, &list);
+ skbs[ret.skb_n++] = skb;
continue;
}
- frames[xdp_n++] = f;
+ frames[ret.xdp_n++] = f;
page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
@@ -327,38 +354,51 @@ static int cpu_map_kthread_run(void *data)
prefetchw(page);
}
+ local_bh_disable();
+
/* Support running another XDP prog on this CPU */
- nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
- if (nframes) {
- m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
- gfp, nframes, skbs);
- if (unlikely(m == 0)) {
- for (i = 0; i < nframes; i++)
- skbs[i] = NULL; /* effect: xdp_return_frame */
- kmem_alloc_drops += nframes;
- }
+ cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats);
+ if (!ret.xdp_n)
+ goto stats;
+
+ m = napi_skb_cache_get_bulk(skbs, ret.xdp_n);
+ if (unlikely(m < ret.xdp_n)) {
+ for (i = m; i < ret.xdp_n; i++)
+ xdp_return_frame(frames[i]);
+
+ if (ret.skb_n)
+ memmove(&skbs[m], &skbs[ret.xdp_n],
+ ret.skb_n * sizeof(*skbs));
+
+ kmem_alloc_drops += ret.xdp_n - m;
+ ret.xdp_n = m;
}
- local_bh_disable();
- for (i = 0; i < nframes; i++) {
+ for (i = 0; i < ret.xdp_n; i++) {
struct xdp_frame *xdpf = frames[i];
- struct sk_buff *skb = skbs[i];
-
- skb = __xdp_build_skb_from_frame(xdpf, skb,
- xdpf->dev_rx);
- if (!skb) {
- xdp_return_frame(xdpf);
- continue;
- }
- list_add_tail(&skb->list, &list);
+ /* Can fail only when !skb -- already handled above */
+ __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx);
}
- netif_receive_skb_list(&list);
- /* Feedback loop via tracepoint */
+stats:
+ /* Feedback loop via tracepoint.
+ * NB: keep before recv to allow measuring enqueue/dequeue latency.
+ */
trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
sched, &stats);
+ for (i = 0; i < ret.xdp_n + ret.skb_n; i++)
+ gro_receive_skb(&rcpu->gro, skbs[i]);
+
+ /* Flush either every 64 packets or in case of empty ring */
+ packets += n;
+ empty = __ptr_ring_empty(rcpu->queue);
+ if (packets >= NAPI_POLL_WEIGHT || empty) {
+ cpu_map_gro_flush(rcpu, empty);
+ packets = 0;
+ }
+
local_bh_enable(); /* resched point, may call do_softirq() */
}
__set_current_state(TASK_RUNNING);
@@ -427,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->cpu = cpu;
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
+ gro_init(&rcpu->gro);
if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
@@ -455,6 +496,7 @@ free_prog:
if (rcpu->prog)
bpf_prog_put(rcpu->prog);
free_ptr_ring:
+ gro_cleanup(&rcpu->gro);
ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
kfree(rcpu->queue);
@@ -484,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work)
if (rcpu->prog)
bpf_prog_put(rcpu->prog);
+ gro_cleanup(&rcpu->gro);
/* The queue should be empty at this point */
__cpu_map_ring_cleanup(rcpu->queue);
ptr_ring_cleanup(rcpu->queue, NULL);
@@ -706,7 +749,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -723,8 +765,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
*/
bq->q[bq->count++] = xdpf;
- if (!bq->flush_node.prev)
+ if (!bq->flush_node.prev) {
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
list_add(&bq->flush_node, flush_list);
+ }
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -756,9 +801,8 @@ trace:
return ret;
}
-void __cpu_map_flush(void)
+void __cpu_map_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -768,24 +812,3 @@ void __cpu_map_flush(void)
wake_up_process(bq->obj->kthread);
}
}
-
-#ifdef CONFIG_DEBUG_NET
-bool cpu_map_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
- return false;
- __cpu_map_flush();
- return true;
-}
-#endif
-
-static int __init cpu_map_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
- return 0;
-}
-
-subsys_initcall(cpu_map_init);