From bd0b2e7fe611953470ec7c533b455fb2abd382cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Dec 2017 15:08:57 -0800 Subject: net: xdp: make the stack take care of the tear down Since day one of XDP drivers had to remember to free the program on the remove path. This leads to code duplication and is error prone. Make the stack query the installed programs on unregister and if something is installed, remove the program. Freeing of program attached to XDP generic is moved from free_netdev() as well. Because the remove will now be called before notifiers are invoked, BPF offload state of the program will not get destroyed before uninstall. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/tun.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 95749006d687..6746e498dc61 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -673,7 +673,6 @@ static void tun_detach(struct tun_file *tfile, bool clean) static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); - struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; @@ -708,9 +707,6 @@ static void tun_detach_all(struct net_device *dev) } BUG_ON(tun->numdisabled != 0); - if (xdp_prog) - bpf_prog_put(xdp_prog); - if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } -- cgit From 96f84061620c6325a2ca9a9a05b410e6461d03c3 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 4 Dec 2017 17:31:23 +0800 Subject: tun: add eBPF based queue selection method This patch introduces an eBPF based queue selection method. With this, the policy could be offloaded to userspace completely through a new ioctl TUNSETSTEERINGEBPF. Signed-off-by: Jason Wang Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tun.c | 145 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 122 insertions(+), 23 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 80568f81a7c8..787cc35ef89b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -195,6 +195,11 @@ struct tun_flow_entry { #define TUN_NUM_FLOW_ENTRIES 1024 +struct tun_steering_prog { + struct rcu_head rcu; + struct bpf_prog *prog; +}; + /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. @@ -232,6 +237,7 @@ struct tun_struct { u32 rx_batched; struct tun_pcpu_stats __percpu *pcpu_stats; struct bpf_prog __rcu *xdp_prog; + struct tun_steering_prog __rcu *steering_prog; }; static int tun_napi_receive(struct napi_struct *napi, int budget) @@ -537,15 +543,12 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) * different rxq no. here. If we could not get rxhash, then we would * hope the rxq no. may help here. */ -static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { - struct tun_struct *tun = netdev_priv(dev); struct tun_flow_entry *e; u32 txq = 0; u32 numqueues = 0; - rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); @@ -563,10 +566,37 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, txq -= numqueues; } - rcu_read_unlock(); return txq; } +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) +{ + struct tun_steering_prog *prog; + u16 ret = 0; + + prog = rcu_dereference(tun->steering_prog); + if (prog) + ret = bpf_prog_run_clear_cb(prog->prog, skb); + + return ret % tun->numqueues; +} + +static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + struct tun_struct *tun = netdev_priv(dev); + u16 ret; + + rcu_read_lock(); + if (rcu_dereference(tun->steering_prog)) + ret = tun_ebpf_select_queue(tun, skb); + else + ret = tun_automq_select_queue(tun, skb); + rcu_read_unlock(); + + return ret; +} + static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); @@ -933,23 +963,10 @@ static int tun_net_close(struct net_device *dev) } /* Net device start xmit */ -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { - struct tun_struct *tun = netdev_priv(dev); - int txq = skb->queue_mapping; - struct tun_file *tfile; - u32 numqueues = 0; - - rcu_read_lock(); - tfile = rcu_dereference(tun->tfiles[txq]); - numqueues = READ_ONCE(tun->numqueues); - - /* Drop packet if interface is not attached */ - if (txq >= numqueues) - goto drop; - #ifdef CONFIG_RPS - if (numqueues == 1 && static_key_false(&rps_needed)) { + if (tun->numqueues == 1 && static_key_false(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ @@ -965,6 +982,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) } } #endif +} + +/* Net device start xmit */ +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + int txq = skb->queue_mapping; + struct tun_file *tfile; + u32 numqueues = 0; + + rcu_read_lock(); + tfile = rcu_dereference(tun->tfiles[txq]); + numqueues = READ_ONCE(tun->numqueues); + + /* Drop packet if interface is not attached */ + if (txq >= numqueues) + goto drop; + + if (!rcu_dereference(tun->steering_prog)) + tun_automq_xmit(tun, skb); tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); @@ -1547,7 +1584,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int copylen; bool zerocopy = false; int err; - u32 rxhash; + u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tun); @@ -1735,7 +1772,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rcu_read_unlock(); } - rxhash = __skb_get_hash_symmetric(skb); + rcu_read_lock(); + if (!rcu_dereference(tun->steering_prog)) + rxhash = __skb_get_hash_symmetric(skb); + rcu_read_unlock(); if (frags) { /* Exercise flow dissector code path. */ @@ -1779,7 +1819,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, u64_stats_update_end(&stats->syncp); put_cpu_ptr(stats); - tun_flow_update(tun, rxhash, tfile); + if (rxhash) + tun_flow_update(tun, rxhash, tfile); + return total_len; } @@ -1987,6 +2029,36 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } +static void tun_steering_prog_free(struct rcu_head *rcu) +{ + struct tun_steering_prog *prog = container_of(rcu, + struct tun_steering_prog, rcu); + + bpf_prog_destroy(prog->prog); + kfree(prog); +} + +static int __tun_set_steering_ebpf(struct tun_struct *tun, + struct bpf_prog *prog) +{ + struct tun_steering_prog *old, *new = NULL; + + if (prog) { + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + new->prog = prog; + } + + old = rtnl_dereference(tun->steering_prog); + rcu_assign_pointer(tun->steering_prog, new); + + if (old) + call_rcu(&old->rcu, tun_steering_prog_free); + + return 0; +} + static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -1995,6 +2067,9 @@ static void tun_free_netdev(struct net_device *dev) free_percpu(tun->pcpu_stats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); + rtnl_lock(); + __tun_set_steering_ebpf(tun, NULL); + rtnl_unlock(); } static void tun_setup(struct net_device *dev) @@ -2283,6 +2358,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; + RCU_INIT_POINTER(tun->steering_prog, NULL); tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); if (!tun->pcpu_stats) { @@ -2475,6 +2551,25 @@ unlock: return ret; } +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data) +{ + struct bpf_prog *prog; + int fd; + + if (copy_from_user(&fd, data, sizeof(fd))) + return -EFAULT; + + if (fd == -1) { + prog = NULL; + } else { + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + return __tun_set_steering_ebpf(tun, prog); +} + static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { @@ -2751,6 +2846,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = 0; break; + case TUNSETSTEERINGEBPF: + ret = tun_set_steering_ebpf(tun, argp); + break; + default: ret = -EINVAL; break; -- cgit From cc166427dcdab39a96140fded18ac23be5f0a1ed Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 5 Dec 2017 22:11:17 -0500 Subject: tun: avoid unnecessary READ_ONCE in tun_net_xmit The statement no longer serves a purpose. Commit fa35864e0bb7 ("tuntap: Fix for a race in accessing numqueues") added the ACCESS_ONCE to avoid a race condition with skb_queue_len. Commit 436accebb530 ("tuntap: remove unnecessary sk_receive_queue length check during xmit") removed the affected skb_queue_len check. Commit 96f84061620c ("tun: add eBPF based queue selection method") split the function, reading the field a second time in the callee. The temp variable is now only read once, so just remove it. Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 787cc35ef89b..c2ad8f3858d1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -990,14 +990,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct tun_file *tfile; - u32 numqueues = 0; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); - numqueues = READ_ONCE(tun->numqueues); /* Drop packet if interface is not attached */ - if (txq >= numqueues) + if (txq >= tun->numqueues) goto drop; if (!rcu_dereference(tun->steering_prog)) -- cgit From 124da8f6118bce3d9aeb5c6f5395ed131141aad5 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 8 Dec 2017 12:02:30 +0800 Subject: tuntap: fix possible deadlock when fail to register netdev Private destructor could be called when register_netdev() fail with rtnl lock held. This will lead deadlock in tun_free_netdev() who tries to hold rtnl_lock. Fixing this by switching to use spinlock to synchronize. Fixes: 96f84061620c ("tun: add eBPF based queue selection method") Reported-by: Eric Dumazet Cc: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c2ad8f3858d1..e367d6310353 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2048,8 +2048,11 @@ static int __tun_set_steering_ebpf(struct tun_struct *tun, new->prog = prog; } - old = rtnl_dereference(tun->steering_prog); + spin_lock_bh(&tun->lock); + old = rcu_dereference_protected(tun->steering_prog, + lockdep_is_held(&tun->lock)); rcu_assign_pointer(tun->steering_prog, new); + spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_steering_prog_free); @@ -2065,9 +2068,7 @@ static void tun_free_netdev(struct net_device *dev) free_percpu(tun->pcpu_stats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); - rtnl_lock(); __tun_set_steering_ebpf(tun, NULL); - rtnl_unlock(); } static void tun_setup(struct net_device *dev) -- cgit From 8bf5c4ee1889308ccd396fdfd40ac94129ee419f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:25:59 +0100 Subject: tun: setup xdp_rxq_info Driver hook points for xdp_rxq_info: * reg : tun_attach * unreg: __tun_detach I've done some manual testing of this tun driver, but I would appriciate good review and someone else running their use-case tests, as I'm not 100% sure I understand the tfile->detached semantics. V2: Removed the skb_array_cleanup() call from V1 by request from Jason Wang. Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: Willem de Bruijn Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Jason Wang Signed-off-by: Alexei Starovoitov --- drivers/net/tun.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e367d6310353..e7c5f4b2a9a6 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -180,6 +180,7 @@ struct tun_file { struct list_head next; struct tun_struct *detached; struct skb_array tx_array; + struct xdp_rxq_info xdp_rxq; }; struct tun_flow_entry { @@ -687,8 +688,10 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } - if (tun) + if (tun) { skb_array_cleanup(&tfile->tx_array); + xdp_rxq_info_unreg(&tfile->xdp_rxq); + } sock_put(&tfile->sk); } } @@ -728,11 +731,13 @@ static void tun_detach_all(struct net_device *dev) tun_napi_del(tun, tfile); /* Drop read queue */ tun_queue_purge(tfile); + xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); tun_queue_purge(tfile); + xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); @@ -784,6 +789,22 @@ static int tun_attach(struct tun_struct *tun, struct file *file, tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; + + if (tfile->detached) { + /* Re-attach detached tfile, updating XDP queue_index */ + WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); + + if (tfile->xdp_rxq.queue_index != tfile->queue_index) + tfile->xdp_rxq.queue_index = tfile->queue_index; + } else { + /* Setup XDP RX-queue info, for new tfile getting attached */ + err = xdp_rxq_info_reg(&tfile->xdp_rxq, + tun->dev, tfile->queue_index); + if (err < 0) + goto out; + err = 0; + } + rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; @@ -1508,6 +1529,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data = buf + pad; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; + xdp.rxq = &tfile->xdp_rxq; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); -- cgit From 5990a30510ed1c37a769d3a035ad2d030b843528 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 4 Jan 2018 11:14:27 +0800 Subject: tun/tap: use ptr_ring instead of skb_array This patch switches to use ptr_ring instead of skb_array. This will be used to enqueue different types of pointers by encoding type into lower bits. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e7c5f4b2a9a6..b8e39c6d2a5f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -179,7 +179,7 @@ struct tun_file { struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; - struct skb_array tx_array; + struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; @@ -635,7 +635,7 @@ static void tun_queue_purge(struct tun_file *tfile) { struct sk_buff *skb; - while ((skb = skb_array_consume(&tfile->tx_array)) != NULL) + while ((skb = ptr_ring_consume(&tfile->tx_ring)) != NULL) kfree_skb(skb); skb_queue_purge(&tfile->sk.sk_write_queue); @@ -689,7 +689,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean) unregister_netdevice(tun->dev); } if (tun) { - skb_array_cleanup(&tfile->tx_array); + ptr_ring_cleanup(&tfile->tx_ring, + __skb_array_destroy_skb); xdp_rxq_info_unreg(&tfile->xdp_rxq); } sock_put(&tfile->sk); @@ -782,7 +783,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, } if (!tfile->detached && - skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) { + ptr_ring_init(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL)) { err = -ENOMEM; goto out; } @@ -1048,7 +1049,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); - if (skb_array_produce(&tfile->tx_array, skb)) + if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; /* Notify and wake up reader process */ @@ -1316,7 +1317,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) poll_wait(file, sk_sleep(sk), wait); - if (!skb_array_empty(&tfile->tx_array)) + if (!ptr_ring_empty(&tfile->tx_ring)) mask |= POLLIN | POLLRDNORM; if (tun->dev->flags & IFF_UP && @@ -1966,7 +1967,7 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, struct sk_buff *skb = NULL; int error = 0; - skb = skb_array_consume(&tfile->tx_array); + skb = ptr_ring_consume(&tfile->tx_ring); if (skb) goto out; if (noblock) { @@ -1978,7 +1979,7 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, current->state = TASK_INTERRUPTIBLE; while (1) { - skb = skb_array_consume(&tfile->tx_array); + skb = ptr_ring_consume(&tfile->tx_ring); if (skb) break; if (signal_pending(current)) { @@ -2208,7 +2209,7 @@ static int tun_peek_len(struct socket *sock) if (!tun) return 0; - ret = skb_array_peek_len(&tfile->tx_array); + ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, __skb_array_len_with_tag); tun_put(tun); return ret; @@ -3114,25 +3115,26 @@ static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; - struct skb_array **arrays; + struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; - arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL); - if (!arrays) + rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); + if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - arrays[i] = &tfile->tx_array; + rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) - arrays[i++] = &tfile->tx_array; + rings[i++] = &tfile->tx_ring; - ret = skb_array_resize_multiple(arrays, n, - dev->tx_queue_len, GFP_KERNEL); + ret = ptr_ring_resize_multiple(rings, n, + dev->tx_queue_len, GFP_KERNEL, + __skb_array_destroy_skb); - kfree(arrays); + kfree(rings); return ret; } @@ -3218,7 +3220,7 @@ struct socket *tun_get_socket(struct file *file) } EXPORT_SYMBOL_GPL(tun_get_socket); -struct skb_array *tun_get_skb_array(struct file *file) +struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; @@ -3227,9 +3229,9 @@ struct skb_array *tun_get_skb_array(struct file *file) tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); - return &tfile->tx_array; + return &tfile->tx_ring; } -EXPORT_SYMBOL_GPL(tun_get_skb_array); +EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); -- cgit From fc72d1d54dd9ffe2552c76b17e9129803ca7b255 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 4 Jan 2018 11:14:28 +0800 Subject: tuntap: XDP transmission This patch implements XDP transmission for TAP. Since we can't create new queues for TAP during XDP set, exist ptr_ring was reused for queuing XDP buffers. To differ xdp_buff from sk_buff, TUN_XDP_FLAG (0x1UL) was encoded into lowest bit of xpd_buff pointer during ptr_ring_produce, and was decoded during consuming. XDP metadata was stored in the headroom of the packet which should work in most of cases since driver usually reserve enough headroom. Very minor changes were done for vhost_net: it just need to peek the length depends on the type of pointer. Tests were done on two Intel E5-2630 2.40GHz machines connected back to back through two 82599ES. Traffic were generated/received through MoonGen/testpmd(rxonly). It reports ~20% improvements when xdp_redirect_map is doing redirection from ixgbe to TAP (from 2.50Mpps to 3.05Mpps) Cc: Jesper Dangaard Brouer Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 211 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 179 insertions(+), 32 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b8e39c6d2a5f..2fba3be5719e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -241,6 +241,24 @@ struct tun_struct { struct tun_steering_prog __rcu *steering_prog; }; +bool tun_is_xdp_buff(void *ptr) +{ + return (unsigned long)ptr & TUN_XDP_FLAG; +} +EXPORT_SYMBOL(tun_is_xdp_buff); + +void *tun_xdp_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr | TUN_XDP_FLAG); +} +EXPORT_SYMBOL(tun_xdp_to_ptr); + +void *tun_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); +} +EXPORT_SYMBOL(tun_ptr_to_xdp); + static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); @@ -631,12 +649,25 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile) return tun; } +static void tun_ptr_free(void *ptr) +{ + if (!ptr) + return; + if (tun_is_xdp_buff(ptr)) { + struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + + put_page(virt_to_head_page(xdp->data)); + } else { + __skb_array_destroy_skb(ptr); + } +} + static void tun_queue_purge(struct tun_file *tfile) { - struct sk_buff *skb; + void *ptr; - while ((skb = ptr_ring_consume(&tfile->tx_ring)) != NULL) - kfree_skb(skb); + while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) + tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); @@ -689,8 +720,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean) unregister_netdevice(tun->dev); } if (tun) { - ptr_ring_cleanup(&tfile->tx_ring, - __skb_array_destroy_skb); + ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); xdp_rxq_info_unreg(&tfile->xdp_rxq); } sock_put(&tfile->sk); @@ -1222,6 +1252,67 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; +static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +{ + struct tun_struct *tun = netdev_priv(dev); + struct xdp_buff *buff = xdp->data_hard_start; + int headroom = xdp->data - xdp->data_hard_start; + struct tun_file *tfile; + u32 numqueues; + int ret = 0; + + /* Assure headroom is available and buff is properly aligned */ + if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp))) + return -ENOSPC; + + *buff = *xdp; + + rcu_read_lock(); + + numqueues = READ_ONCE(tun->numqueues); + if (!numqueues) { + ret = -ENOSPC; + goto out; + } + + tfile = rcu_dereference(tun->tfiles[smp_processor_id() % + numqueues]); + /* Encode the XDP flag into lowest bit for consumer to differ + * XDP buffer from sk_buff. + */ + if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) { + this_cpu_inc(tun->pcpu_stats->tx_dropped); + ret = -ENOSPC; + } + +out: + rcu_read_unlock(); + return ret; +} + +static void tun_xdp_flush(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + struct tun_file *tfile; + u32 numqueues; + + rcu_read_lock(); + + numqueues = READ_ONCE(tun->numqueues); + if (!numqueues) + goto out; + + tfile = rcu_dereference(tun->tfiles[smp_processor_id() % + numqueues]); + /* Notify and wake up reader process */ + if (tfile->flags & TUN_FASYNC) + kill_fasync(&tfile->fasync, SIGIO, POLL_IN); + tfile->socket.sk->sk_data_ready(tfile->socket.sk); + +out: + rcu_read_unlock(); +} + static const struct net_device_ops tap_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, @@ -1239,6 +1330,8 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, .ndo_bpf = tun_xdp, + .ndo_xdp_xmit = tun_xdp_xmit, + .ndo_xdp_flush = tun_xdp_flush, }; static void tun_flow_init(struct tun_struct *tun) @@ -1863,6 +1956,40 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) return result; } +static ssize_t tun_put_user_xdp(struct tun_struct *tun, + struct tun_file *tfile, + struct xdp_buff *xdp, + struct iov_iter *iter) +{ + int vnet_hdr_sz = 0; + size_t size = xdp->data_end - xdp->data; + struct tun_pcpu_stats *stats; + size_t ret; + + if (tun->flags & IFF_VNET_HDR) { + struct virtio_net_hdr gso = { 0 }; + + vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); + if (unlikely(iov_iter_count(iter) < vnet_hdr_sz)) + return -EINVAL; + if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) != + sizeof(gso))) + return -EFAULT; + iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); + } + + ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz; + + stats = get_cpu_ptr(tun->pcpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->tx_packets++; + stats->tx_bytes += ret; + u64_stats_update_end(&stats->syncp); + put_cpu_ptr(tun->pcpu_stats); + + return ret; +} + /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, @@ -1960,15 +2087,14 @@ done: return total; } -static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, - int *err) +static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); - struct sk_buff *skb = NULL; + void *ptr = NULL; int error = 0; - skb = ptr_ring_consume(&tfile->tx_ring); - if (skb) + ptr = ptr_ring_consume(&tfile->tx_ring); + if (ptr) goto out; if (noblock) { error = -EAGAIN; @@ -1979,8 +2105,8 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, current->state = TASK_INTERRUPTIBLE; while (1) { - skb = ptr_ring_consume(&tfile->tx_ring); - if (skb) + ptr = ptr_ring_consume(&tfile->tx_ring); + if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; @@ -1999,12 +2125,12 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock, out: *err = error; - return skb; + return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, - int noblock, struct sk_buff *skb) + int noblock, void *ptr) { ssize_t ret; int err; @@ -2012,23 +2138,31 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, tun_debug(KERN_INFO, tun, "tun_do_read\n"); if (!iov_iter_count(to)) { - if (skb) - kfree_skb(skb); + tun_ptr_free(ptr); return 0; } - if (!skb) { + if (!ptr) { /* Read frames from ring */ - skb = tun_ring_recv(tfile, noblock, &err); - if (!skb) + ptr = tun_ring_recv(tfile, noblock, &err); + if (!ptr) return err; } - ret = tun_put_user(tun, tfile, skb, to); - if (unlikely(ret < 0)) - kfree_skb(skb); - else - consume_skb(skb); + if (tun_is_xdp_buff(ptr)) { + struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + + ret = tun_put_user_xdp(tun, tfile, xdp, to); + put_page(virt_to_head_page(xdp->data)); + } else { + struct sk_buff *skb = ptr; + + ret = tun_put_user(tun, tfile, skb, to); + if (unlikely(ret < 0)) + kfree_skb(skb); + else + consume_skb(skb); + } return ret; } @@ -2165,12 +2299,12 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); - struct sk_buff *skb = m->msg_control; + void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; - goto out_free_skb; + goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { @@ -2182,7 +2316,7 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb); + ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; @@ -2193,12 +2327,25 @@ out: out_put_tun: tun_put(tun); -out_free_skb: - if (skb) - kfree_skb(skb); +out_free: + tun_ptr_free(ptr); return ret; } +static int tun_ptr_peek_len(void *ptr) +{ + if (likely(ptr)) { + if (tun_is_xdp_buff(ptr)) { + struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); + + return xdp->data_end - xdp->data; + } + return __skb_array_len_with_tag(ptr); + } else { + return 0; + } +} + static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); @@ -2209,7 +2356,7 @@ static int tun_peek_len(struct socket *sock) if (!tun) return 0; - ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, __skb_array_len_with_tag); + ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; @@ -3132,7 +3279,7 @@ static int tun_queue_resize(struct tun_struct *tun) ret = ptr_ring_resize_multiple(rings, n, dev->tx_queue_len, GFP_KERNEL, - __skb_array_destroy_skb); + tun_ptr_free); kfree(rings); return ret; -- cgit From cd5681d7d8903bf43a571aaf96cf6d2e2e00118c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 16 Jan 2018 16:31:01 +0800 Subject: tuntap: rename struct tun_steering_prog to struct tun_prog To be reused by other eBPF program other than queue selection. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2fba3be5719e..76197ede22a9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -196,7 +196,7 @@ struct tun_flow_entry { #define TUN_NUM_FLOW_ENTRIES 1024 -struct tun_steering_prog { +struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; @@ -238,7 +238,7 @@ struct tun_struct { u32 rx_batched; struct tun_pcpu_stats __percpu *pcpu_stats; struct bpf_prog __rcu *xdp_prog; - struct tun_steering_prog __rcu *steering_prog; + struct tun_prog __rcu *steering_prog; }; bool tun_is_xdp_buff(void *ptr) @@ -590,7 +590,7 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { - struct tun_steering_prog *prog; + struct tun_prog *prog; u16 ret = 0; prog = rcu_dereference(tun->steering_prog); @@ -2184,19 +2184,18 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } -static void tun_steering_prog_free(struct rcu_head *rcu) +static void tun_prog_free(struct rcu_head *rcu) { - struct tun_steering_prog *prog = container_of(rcu, - struct tun_steering_prog, rcu); + struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } -static int __tun_set_steering_ebpf(struct tun_struct *tun, - struct bpf_prog *prog) +static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p, + struct bpf_prog *prog) { - struct tun_steering_prog *old, *new = NULL; + struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc(sizeof(*new), GFP_KERNEL); @@ -2206,13 +2205,13 @@ static int __tun_set_steering_ebpf(struct tun_struct *tun, } spin_lock_bh(&tun->lock); - old = rcu_dereference_protected(tun->steering_prog, + old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); - rcu_assign_pointer(tun->steering_prog, new); + rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) - call_rcu(&old->rcu, tun_steering_prog_free); + call_rcu(&old->rcu, tun_prog_free); return 0; } @@ -2225,7 +2224,7 @@ static void tun_free_netdev(struct net_device *dev) free_percpu(tun->pcpu_stats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); - __tun_set_steering_ebpf(tun, NULL); + __tun_set_ebpf(tun, &tun->steering_prog, NULL); } static void tun_setup(struct net_device *dev) @@ -2720,7 +2719,8 @@ unlock: return ret; } -static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data) +static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p, + void __user *data) { struct bpf_prog *prog; int fd; @@ -2736,7 +2736,7 @@ static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data) return PTR_ERR(prog); } - return __tun_set_steering_ebpf(tun, prog); + return __tun_set_ebpf(tun, prog_p, prog); } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, @@ -3016,7 +3016,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; case TUNSETSTEERINGEBPF: - ret = tun_set_steering_ebpf(tun, argp); + ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; default: -- cgit From aff3d70a07fffc0abb53663e4a4acb059d2f36af Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 16 Jan 2018 16:31:02 +0800 Subject: tun: allow to attach ebpf socket filter This patch allows userspace to attach eBPF filter to tun. This will allow to implement VM dataplane filtering in a more efficient way compared to cBPF filter by allowing either qemu or libvirt to attach eBPF filter to tun. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 76197ede22a9..170a3e89b5af 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -239,6 +239,12 @@ struct tun_struct { struct tun_pcpu_stats __percpu *pcpu_stats; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; + struct tun_prog __rcu *filter_prog; +}; + +struct veth { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; }; bool tun_is_xdp_buff(void *ptr) @@ -1036,12 +1042,25 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) #endif } +static unsigned int run_ebpf_filter(struct tun_struct *tun, + struct sk_buff *skb, + int len) +{ + struct tun_prog *prog = rcu_dereference(tun->filter_prog); + + if (prog) + len = bpf_prog_run_clear_cb(prog->prog, skb); + + return len; +} + /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct tun_file *tfile; + int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); @@ -1067,6 +1086,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; + len = run_ebpf_filter(tun, skb, len); + + /* Trim extra bytes since we may insert vlan proto & TCI + * in tun_put_user(). + */ + len -= skb_vlan_tag_present(skb) ? sizeof(struct veth) : 0; + if (len <= 0 || pskb_trim(skb, len)) + goto drop; + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; @@ -2054,10 +2082,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (vlan_hlen) { int ret; - struct { - __be16 h_vlan_proto; - __be16 h_vlan_TCI; - } veth; + struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); @@ -2225,6 +2250,7 @@ static void tun_free_netdev(struct net_device *dev) tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); + __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) @@ -3019,6 +3045,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; + case TUNSETFILTEREBPF: + ret = tun_set_ebpf(tun, &tun->filter_prog, argp); + break; + default: ret = -EINVAL; break; -- cgit From 9d6474e458b13a94a0d5b141f2b8f38adf1991ae Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 22 Jan 2018 10:55:38 +0800 Subject: tun: add missing rcu annotation This patch fixes the following sparse warnings: drivers/net/tun.c:2241:15: error: incompatible types in comparison expression (different address spaces) Fixes: cd5681d7d890 ("tuntap: rename struct tun_steering_prog to struct tun_prog") Cc: Daniel Borkmann Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 698874684b4e..2bc18b16a45b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2225,7 +2225,8 @@ static void tun_prog_free(struct rcu_head *rcu) kfree(prog); } -static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p, +static int __tun_set_ebpf(struct tun_struct *tun, + struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; -- cgit From c13da21cdb8085f1d9a53463c88a4d3967fe90fd Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 22 Jan 2018 13:49:27 -0800 Subject: tun: avoid calling xdp_rxq_info_unreg() twice Similarly to tx ring, xdp_rxq_info is only registered when !tfile->detached, so we need to avoid calling xdp_rxq_info_unreg() twice too. The helper tun_cleanup_tx_ring() already checks for this properly, so it is correct to put xdp_rxq_info_unreg() just inside there. Reported-by: syzbot+1c788d7ce0f0888f1d7f@syzkaller.appspotmail.com Fixes: 8565d26bcb2f ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Cc: Jason Wang Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2bc18b16a45b..a0c5cb1a1617 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -774,14 +774,12 @@ static void tun_detach_all(struct net_device *dev) tun_napi_del(tun, tfile); /* Drop read queue */ tun_queue_purge(tfile); - xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); tun_cleanup_tx_ring(tfile); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); tun_queue_purge(tfile); - xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); tun_cleanup_tx_ring(tfile); } -- cgit