Diffstat (limited to 'net/xdp/xsk.c')
-rw-r--r--	net/xdp/xsk.c	1251
1 file changed, 1026 insertions(+), 225 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 3700266229f6..f093c3453f64 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -22,108 +22,137 @@
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/rculist.h>
+#include <linux/vmalloc.h>
 #include <net/xdp_sock_drv.h>
+#include <net/busy_poll.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
 #include <net/xdp.h>
 #include "xsk_queue.h"
 #include "xdp_umem.h"
 #include "xsk.h"
 
-#define TX_BATCH_SIZE 16
+#define TX_BATCH_SIZE 32
+#define MAX_PER_SOCKET_BUDGET 32
 
-static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
+struct xsk_addrs {
+	u32 num_descs;
+	u64 addrs[MAX_SKB_FRAGS + 1];
+};
 
-bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
-{
-	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
-		READ_ONCE(xs->umem->fq);
-}
+static struct kmem_cache *xsk_tx_generic_cache;
 
-void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
-	if (umem->need_wakeup & XDP_WAKEUP_RX)
+	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
 		return;
 
-	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
-	umem->need_wakeup |= XDP_WAKEUP_RX;
+	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
 }
 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
 
-void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	struct xdp_sock *xs;
 
-	if (umem->need_wakeup & XDP_WAKEUP_TX)
+	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
 		return;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
 	}
 	rcu_read_unlock();
 
-	umem->need_wakeup |= XDP_WAKEUP_TX;
+	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
 }
 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
 
-void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
-	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
+	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
 		return;
 
-	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
-	umem->need_wakeup &= ~XDP_WAKEUP_RX;
+	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
 }
 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
 
-void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	struct xdp_sock *xs;
 
-	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
+	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
 		return;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
 	}
 	rcu_read_unlock();
 
-	umem->need_wakeup &= ~XDP_WAKEUP_TX;
+	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
 }
 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
 
-bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
 {
-	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
+	return pool->uses_need_wakeup;
 }
-EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
+EXPORT_SYMBOL(xsk_uses_need_wakeup);
 
-void xp_release(struct xdp_buff_xsk *xskb)
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
+					    u16 queue_id)
 {
-	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
+	if (queue_id < dev->real_num_rx_queues)
+		return dev->_rx[queue_id].pool;
+	if (queue_id < dev->real_num_tx_queues)
+		return dev->_tx[queue_id].pool;
+
+	return NULL;
 }
+EXPORT_SYMBOL(xsk_get_pool_from_qid);
 
-static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
 {
-	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
+	if (queue_id < dev->num_rx_queues)
+		dev->_rx[queue_id].pool = NULL;
+	if (queue_id < dev->num_tx_queues)
+		dev->_tx[queue_id].pool = NULL;
+}
 
-	offset += xskb->pool->headroom;
-	if (!xskb->pool->unaligned)
-		return xskb->orig_addr + offset;
-	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
+/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
+ * not know if the device has more tx queues than rx, or the opposite.
+ * This might also change during run time.
+ */
+int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
+			u16 queue_id)
+{
+	if (queue_id >= max_t(unsigned int,
+			      dev->real_num_rx_queues,
+			      dev->real_num_tx_queues))
+		return -EINVAL;
+
+	if (queue_id < dev->real_num_rx_queues)
+		dev->_rx[queue_id].pool = pool;
+	if (queue_id < dev->real_num_tx_queues)
+		dev->_tx[queue_id].pool = pool;
+
+	return 0;
 }
 
-static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
+			u32 flags)
 {
-	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
 	u64 addr;
 	int err;
 
-	addr = xp_get_handle(xskb);
-	err = xskq_prod_reserve_desc(xs->rx, addr, len);
+	addr = xp_get_handle(xskb, xskb->pool);
+	err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
 	if (err) {
-		xs->rx_dropped++;
+		xs->rx_queue_full++;
 		return err;
 	}
@@ -131,52 +160,160 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 	return 0;
 }
 
-static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
+static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	void *from_buf, *to_buf;
-	u32 metalen;
+	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+	u32 frags = xdp_buff_has_frags(xdp);
+	struct xdp_buff_xsk *pos, *tmp;
+	struct list_head *xskb_list;
+	u32 contd = 0;
+	int err;
 
-	if (unlikely(xdp_data_meta_unsupported(from))) {
-		from_buf = from->data;
-		to_buf = to->data;
-		metalen = 0;
-	} else {
-		from_buf = from->data_meta;
-		metalen = from->data - from->data_meta;
-		to_buf = to->data - metalen;
+	if (frags)
+		contd = XDP_PKT_CONTD;
+
+	err = __xsk_rcv_zc(xs, xskb, len, contd);
+	if (err)
+		goto err;
+	if (likely(!frags))
+		return 0;
+
+	xskb_list = &xskb->pool->xskb_list;
+	list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
+		if (list_is_singular(xskb_list))
+			contd = 0;
+		len = pos->xdp.data_end - pos->xdp.data;
+		err = __xsk_rcv_zc(xs, pos, len, contd);
+		if (err)
+			goto err;
+		list_del(&pos->list_node);
 	}
 
-	memcpy(to_buf, from_buf, len + metalen);
+	return 0;
+err:
+	xsk_buff_free(xdp);
+	return err;
+}
+
+static void *xsk_copy_xdp_start(struct xdp_buff *from)
+{
+	if (unlikely(xdp_data_meta_unsupported(from)))
+		return from->data;
+	else
+		return from->data_meta;
 }
 
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
-		     bool explicit_free)
+static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
+			u32 *from_len, skb_frag_t **frag, u32 rem)
 {
+	u32 copied = 0;
+
+	while (1) {
+		u32 copy_len = min_t(u32, *from_len, to_len);
+
+		memcpy(to, *from, copy_len);
+		copied += copy_len;
+		if (rem == copied)
+			return copied;
+
+		if (*from_len == copy_len) {
+			*from = skb_frag_address(*frag);
+			*from_len = skb_frag_size((*frag)++);
+		} else {
+			*from += copy_len;
+			*from_len -= copy_len;
+		}
+		if (to_len == copy_len)
+			return copied;
+
+		to_len -= copy_len;
+		to += copy_len;
+	}
+}
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+	u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
+	void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
+	u32 from_len, meta_len, rem, num_desc;
+	struct xdp_buff_xsk *xskb;
 	struct xdp_buff *xsk_xdp;
-	int err;
+	skb_frag_t *frag;
 
-	if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
-		xs->rx_dropped++;
-		return -ENOSPC;
+	from_len = xdp->data_end - copy_from;
+	meta_len = xdp->data - copy_from;
+	rem = len + meta_len;
+
+	if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
+		int err;
+
+		xsk_xdp = xsk_buff_alloc(xs->pool);
+		if (!xsk_xdp) {
+			xs->rx_dropped++;
+			return -ENOMEM;
+		}
+		memcpy(xsk_xdp->data - meta_len, copy_from, rem);
+		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+		err = __xsk_rcv_zc(xs, xskb, len, 0);
+		if (err) {
+			xsk_buff_free(xsk_xdp);
+			return err;
+		}
+
+		return 0;
 	}
 
-	xsk_xdp = xsk_buff_alloc(xs->umem);
-	if (!xsk_xdp) {
+	num_desc = (len - 1) / frame_size + 1;
+
+	if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
 		xs->rx_dropped++;
-		return -ENOSPC;
+		return -ENOMEM;
+	}
+	if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
+		xs->rx_queue_full++;
+		return -ENOBUFS;
 	}
 
-	xsk_copy_xdp(xsk_xdp, xdp, len);
-	err = __xsk_rcv_zc(xs, xsk_xdp, len);
-	if (err) {
-		xsk_buff_free(xsk_xdp);
-		return err;
+	if (xdp_buff_has_frags(xdp)) {
+		struct skb_shared_info *sinfo;
+
+		sinfo = xdp_get_shared_info_from_buff(xdp);
+		frag = &sinfo->frags[0];
 	}
 
-	if (explicit_free)
-		xdp_return_buff(xdp);
+	do {
+		u32 to_len = frame_size + meta_len;
+		u32 copied;
+
+		xsk_xdp = xsk_buff_alloc(xs->pool);
+		copy_to = xsk_xdp->data - meta_len;
+
+		copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
+		rem -= copied;
+
+		xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+		__xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
+		meta_len = 0;
+	} while (rem);
+
 	return 0;
 }
 
+static bool xsk_tx_writeable(struct xdp_sock *xs)
+{
+	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
+		return false;
+
+	return true;
+}
+
+static void __xsk_tx_release(struct xdp_sock *xs)
+{
+	__xskq_cons_release(xs->tx);
+	if (xsk_tx_writeable(xs))
+		xs->sk.sk_write_space(&xs->sk);
+}
+
 static bool xsk_is_bound(struct xdp_sock *xs)
 {
 	if (READ_ONCE(xs->state) == XSK_BOUND) {
@@ -187,60 +324,84 @@ static bool xsk_is_bound(struct xdp_sock *xs)
 	return false;
 }
 
-static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
-		   bool explicit_free)
+static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	u32 len;
-
 	if (!xsk_is_bound(xs))
-		return -EINVAL;
+		return -ENXIO;
 
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	len = xdp->data_end - xdp->data;
+	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
+		xs->rx_dropped++;
+		return -ENOSPC;
+	}
 
-	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
-		__xsk_rcv_zc(xs, xdp, len) :
-		__xsk_rcv(xs, xdp, len, explicit_free);
+	return 0;
 }
 
 static void xsk_flush(struct xdp_sock *xs)
 {
 	xskq_prod_submit(xs->rx);
-	__xskq_cons_release(xs->umem->fq);
+	__xskq_cons_release(xs->pool->fq);
 	sock_def_readable(&xs->sk);
 }
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
+	u32 len = xdp_get_buff_len(xdp);
+	int err;
+
+	err = xsk_rcv_check(xs, xdp, len);
+	if (!err) {
+		spin_lock_bh(&xs->pool->rx_lock);
+		err = __xsk_rcv(xs, xdp, len);
+		xsk_flush(xs);
+		spin_unlock_bh(&xs->pool->rx_lock);
+	}
+
+	return err;
+}
+
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 len = xdp_get_buff_len(xdp);
 	int err;
 
-	spin_lock_bh(&xs->rx_lock);
-	err = xsk_rcv(xs, xdp, false);
-	xsk_flush(xs);
-	spin_unlock_bh(&xs->rx_lock);
+	err = xsk_rcv_check(xs, xdp, len);
+	if (err)
+		return err;
+
+	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
+		len = xdp->data_end - xdp->data;
+		return xsk_rcv_zc(xs, xdp, len);
+	}
 
+	err = __xsk_rcv(xs, xdp, len);
+	if (!err)
+		xdp_return_buff(xdp);
 	return err;
 }
 
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
 	int err;
 
-	err = xsk_rcv(xs, xdp, true);
+	err = xsk_rcv(xs, xdp);
 	if (err)
 		return err;
 
-	if (!xs->flush_node.prev)
+	if (!xs->flush_node.prev) {
+		struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
+
 		list_add(&xs->flush_node, flush_list);
+	}
 
 	return 0;
 }
 
-void __xsk_map_flush(void)
+void __xsk_map_flush(struct list_head *flush_list)
 {
-	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
 	struct xdp_sock *xs, *tmp;
 
 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
@@ -249,40 +410,50 @@ void __xsk_map_flush(void)
 	}
 }
 
-void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 {
-	xskq_prod_submit_n(umem->cq, nb_entries);
+	xskq_prod_submit_n(pool->cq, nb_entries);
 }
-EXPORT_SYMBOL(xsk_umem_complete_tx);
+EXPORT_SYMBOL(xsk_tx_completed);
 
-void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+void xsk_tx_release(struct xsk_buff_pool *pool)
 {
 	struct xdp_sock *xs;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
-		__xskq_cons_release(xs->tx);
-		xs->sk.sk_write_space(&xs->sk);
-	}
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
+		__xsk_tx_release(xs);
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+EXPORT_SYMBOL(xsk_tx_release);
 
-bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
+bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
 {
+	bool budget_exhausted = false;
 	struct xdp_sock *xs;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) {
-		if (!xskq_cons_peek_desc(xs->tx, desc, umem))
+again:
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+		if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
+			budget_exhausted = true;
 			continue;
+		}
+
+		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
+			if (xskq_has_descs(xs->tx))
+				xskq_cons_release(xs->tx);
+			continue;
+		}
+
+		xs->tx_budget_spent++;
 
 		/* This is the backpressure mechanism for the Tx path.
 		 * Reserve space in the completion queue and only proceed
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		if (xskq_prod_reserve_addr(umem->cq, desc->addr))
+		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
 			goto out;
 
 		xskq_cons_release(xs->tx);
@@ -290,163 +461,676 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
 		return true;
 	}
 
+	if (budget_exhausted) {
+		list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
+			xs->tx_budget_spent = 0;
+
+		budget_exhausted = false;
+		goto again;
+	}
+
 out:
 	rcu_read_unlock();
 	return false;
 }
-EXPORT_SYMBOL(xsk_umem_consume_tx);
+EXPORT_SYMBOL(xsk_tx_peek_desc);
 
-static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
+static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
 {
-	struct net_device *dev = xs->dev;
-	int err;
+	struct xdp_desc *descs = pool->tx_descs;
+	u32 nb_pkts = 0;
+
+	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
+		nb_pkts++;
+
+	xsk_tx_release(pool);
+	return nb_pkts;
+}
+
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
+{
+	struct xdp_sock *xs;
 
 	rcu_read_lock();
-	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
+	if (!list_is_singular(&pool->xsk_tx_list)) {
+		/* Fallback to the non-batched version */
+		rcu_read_unlock();
+		return xsk_tx_peek_release_fallback(pool, nb_pkts);
+	}
+
+	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
+	if (!xs) {
+		nb_pkts = 0;
+		goto out;
+	}
+
+	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
+
+	/* This is the backpressure mechanism for the Tx path. Try to
+	 * reserve space in the completion queue for all packets, but
+	 * if there are fewer slots available, just process that many
+	 * packets. This avoids having to implement any buffering in
+	 * the Tx path.
+	 */
+	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
+	if (!nb_pkts)
+		goto out;
+
+	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
+	if (!nb_pkts) {
+		xs->tx->queue_empty_descs++;
+		goto out;
+	}
+
+	__xskq_cons_release(xs->tx);
+	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
+	xs->sk.sk_write_space(&xs->sk);
+
+out:
 	rcu_read_unlock();
+	return nb_pkts;
+}
+EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
 
-	return err;
+static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
+{
+	struct net_device *dev = xs->dev;
+
+	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_zc_xmit(struct xdp_sock *xs)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 {
-	return xsk_wakeup(xs, XDP_WAKEUP_TX);
+	int ret;
+
+	spin_lock(&pool->cq_cached_prod_lock);
+	ret = xskq_prod_reserve(pool->cq);
+	spin_unlock(&pool->cq_cached_prod_lock);
+
+	return ret;
 }
 
-static void xsk_destruct_skb(struct sk_buff *skb)
+static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
 {
-	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
-	struct xdp_sock *xs = xdp_sk(skb->sk);
+	return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
+}
+
+static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+{
+	return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
+}
+
+static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+	skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+}
+
+static void xsk_inc_num_desc(struct sk_buff *skb)
+{
+	struct xsk_addrs *xsk_addr;
+
+	if (!xsk_skb_destructor_is_addr(skb)) {
+		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		xsk_addr->num_descs++;
+	}
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+	struct xsk_addrs *xsk_addr;
+
+	if (xsk_skb_destructor_is_addr(skb))
+		return 1;
+
+	xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+	return xsk_addr->num_descs;
+}
+
+static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
+				      struct sk_buff *skb)
+{
+	u32 num_descs = xsk_get_num_desc(skb);
+	struct xsk_addrs *xsk_addr;
+	u32 descs_processed = 0;
 	unsigned long flags;
+	u32 idx, i;
+
+	spin_lock_irqsave(&pool->cq_prod_lock, flags);
+	idx = xskq_get_prod(pool->cq);
 
-	spin_lock_irqsave(&xs->tx_completion_lock, flags);
-	xskq_prod_submit_addr(xs->umem->cq, addr);
-	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
+	if (unlikely(num_descs > 1)) {
+		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		for (i = 0; i < num_descs; i++) {
+			xskq_prod_write_addr(pool->cq, idx + descs_processed,
+					     xsk_addr->addrs[i]);
+			descs_processed++;
+		}
+		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+	} else {
+		xskq_prod_write_addr(pool->cq, idx,
+				     xsk_skb_destructor_get_addr(skb));
+		descs_processed++;
+	}
+	xskq_prod_submit_n(pool->cq, descs_processed);
+	spin_unlock_irqrestore(&pool->cq_prod_lock, flags);
+}
+
+static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
+{
+	spin_lock(&pool->cq_cached_prod_lock);
+	xskq_prod_cancel_n(pool->cq, n);
+	spin_unlock(&pool->cq_cached_prod_lock);
+}
+
+INDIRECT_CALLABLE_SCOPE
+void xsk_destruct_skb(struct sk_buff *skb)
+{
+	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
+
+	if (compl->tx_timestamp) {
+		/* sw completion timestamp, not a real one */
+		*compl->tx_timestamp = ktime_get_tai_fast_ns();
+	}
+
+	xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
 	sock_wfree(skb);
 }
 
-static int xsk_generic_xmit(struct sock *sk)
+static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+			      u64 addr)
+{
+	skb->dev = xs->dev;
+	skb->priority = READ_ONCE(xs->sk.sk_priority);
+	skb->mark = READ_ONCE(xs->sk.sk_mark);
+	skb->destructor = xsk_destruct_skb;
+	xsk_skb_destructor_set_addr(skb, addr);
+}
+
+static void xsk_consume_skb(struct sk_buff *skb)
+{
+	struct xdp_sock *xs = xdp_sk(skb->sk);
+	u32 num_descs = xsk_get_num_desc(skb);
+	struct xsk_addrs *xsk_addr;
+
+	if (unlikely(num_descs > 1)) {
+		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+	}
+
+	skb->destructor = sock_wfree;
+	xsk_cq_cancel_locked(xs->pool, num_descs);
+	/* Free skb without triggering the perf drop trace */
+	consume_skb(skb);
+	xs->skb = NULL;
+}
+
+static void xsk_drop_skb(struct sk_buff *skb)
+{
+	xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
+	xsk_consume_skb(skb);
+}
+
+static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
+			    struct xdp_desc *desc, struct xsk_buff_pool *pool,
+			    u32 hr)
+{
+	struct xsk_tx_metadata *meta = NULL;
+
+	if (unlikely(pool->tx_metadata_len == 0))
+		return -EINVAL;
+
+	meta = buffer - pool->tx_metadata_len;
+	if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
+		return -EINVAL;
+
+	if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
+		if (unlikely(meta->request.csum_start +
+			     meta->request.csum_offset +
+			     sizeof(__sum16) > desc->len))
+			return -EINVAL;
+
+		skb->csum_start = hr + meta->request.csum_start;
+		skb->csum_offset = meta->request.csum_offset;
+		skb->ip_summed = CHECKSUM_PARTIAL;
+
+		if (unlikely(pool->tx_sw_csum)) {
+			int err;
+
+			err = skb_checksum_help(skb);
+			if (err)
+				return err;
+		}
+	}
+
+	if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+		skb->skb_mstamp_ns = meta->request.launch_time;
+	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
+
+	return 0;
+}
+
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+					      struct xdp_desc *desc)
+{
+	struct xsk_buff_pool *pool = xs->pool;
+	u32 hr, len, ts, offset, copy, copied;
+	struct sk_buff *skb = xs->skb;
+	struct page *page;
+	void *buffer;
+	int err, i;
+	u64 addr;
+
+	addr = desc->addr;
+	buffer = xsk_buff_raw_get_data(pool, addr);
+
+	if (!skb) {
+		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+
+		skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+		if (unlikely(!skb))
+			return ERR_PTR(err);
+
+		skb_reserve(skb, hr);
+
+		xsk_skb_init_misc(skb, xs, desc->addr);
+		if (desc->options & XDP_TX_METADATA) {
+			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
+			if (unlikely(err))
+				return ERR_PTR(err);
+		}
+	} else {
+		struct xsk_addrs *xsk_addr;
+
+		if (xsk_skb_destructor_is_addr(skb)) {
+			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+						     GFP_KERNEL);
+			if (!xsk_addr)
+				return ERR_PTR(-ENOMEM);
+
+			xsk_addr->num_descs = 1;
+			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+		} else {
+			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		}
+
+		/* in case of -EOVERFLOW that could happen below,
+		 * xsk_consume_skb() will release this node as whole skb
+		 * would be dropped, which implies freeing all list elements
+		 */
+		xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
+	}
+
+	len = desc->len;
+	ts = pool->unaligned ? len : pool->chunk_size;
+
+	offset = offset_in_page(buffer);
+	addr = buffer - pool->addrs;
+
+	for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
+		if (unlikely(i >= MAX_SKB_FRAGS))
+			return ERR_PTR(-EOVERFLOW);
+
+		page = pool->umem->pgs[addr >> PAGE_SHIFT];
+		get_page(page);
+
+		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
+		skb_fill_page_desc(skb, i, page, offset, copy);
+
+		copied += copy;
+		addr += copy;
+		offset = 0;
+	}
+
+	skb->len += len;
+	skb->data_len += len;
+	skb->truesize += ts;
+
+	refcount_add(ts, &xs->sk.sk_wmem_alloc);
+
+	return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+				     struct xdp_desc *desc)
+{
+	struct net_device *dev = xs->dev;
+	struct sk_buff *skb = xs->skb;
+	int err;
+
+	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
+		skb = xsk_build_skb_zerocopy(xs, desc);
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
+			skb = NULL;
+			goto free_err;
+		}
+	} else {
+		u32 hr, tr, len;
+		void *buffer;
+
+		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+		len = desc->len;
+
+		if (!skb) {
+			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+			tr = dev->needed_tailroom;
+			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+			if (unlikely(!skb))
+				goto free_err;
+
+			skb_reserve(skb, hr);
+			skb_put(skb, len);
+
+			err = skb_store_bits(skb, 0, buffer, len);
+			if (unlikely(err))
+				goto free_err;
+
+			xsk_skb_init_misc(skb, xs, desc->addr);
+			if (desc->options & XDP_TX_METADATA) {
+				err = xsk_skb_metadata(skb, buffer, desc,
+						       xs->pool, hr);
+				if (unlikely(err))
+					goto free_err;
+			}
+		} else {
+			int nr_frags = skb_shinfo(skb)->nr_frags;
+			struct xsk_addrs *xsk_addr;
+			struct page *page;
+			u8 *vaddr;
+
+			if (xsk_skb_destructor_is_addr(skb)) {
+				xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+							     GFP_KERNEL);
+				if (!xsk_addr) {
+					err = -ENOMEM;
+					goto free_err;
+				}
+
+				xsk_addr->num_descs = 1;
+				xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+				skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+			} else {
+				xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+			}
+
+			if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
+				err = -EOVERFLOW;
+				goto free_err;
+			}
+
+			page = alloc_page(xs->sk.sk_allocation);
+			if (unlikely(!page)) {
+				err = -EAGAIN;
+				goto free_err;
+			}
+
+			vaddr = kmap_local_page(page);
+			memcpy(vaddr, buffer, len);
+			kunmap_local(vaddr);
+
+			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
+			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
+
+			xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
+		}
+	}
+
+	xsk_inc_num_desc(skb);
+
+	return skb;
+
+free_err:
+	if (skb && !skb_shinfo(skb)->nr_frags)
+		kfree_skb(skb);
+
+	if (err == -EOVERFLOW) {
+		/* Drop the packet */
+		xsk_inc_num_desc(xs->skb);
+		xsk_drop_skb(xs->skb);
+		xskq_cons_release(xs->tx);
+	} else {
+		/* Let application retry */
+		xsk_cq_cancel_locked(xs->pool, 1);
+	}
+
+	return ERR_PTR(err);
+}
+
+static int __xsk_generic_xmit(struct sock *sk)
 {
 	struct xdp_sock *xs = xdp_sk(sk);
-	u32 max_batch = TX_BATCH_SIZE;
 	bool sent_frame = false;
 	struct xdp_desc desc;
 	struct sk_buff *skb;
+	u32 max_batch;
 	int err = 0;
 
 	mutex_lock(&xs->mutex);
 
-	if (xs->queue_id >= xs->dev->real_num_tx_queues)
+	/* Since we dropped the RCU read lock, the socket state might have changed. */
+	if (unlikely(!xsk_is_bound(xs))) {
+		err = -ENXIO;
 		goto out;
+	}
 
-	while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
-		char *buffer;
-		u64 addr;
-		u32 len;
+	if (xs->queue_id >= xs->dev->real_num_tx_queues)
+		goto out;
 
+	max_batch = READ_ONCE(xs->max_tx_budget);
+	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
 		if (max_batch-- == 0) {
 			err = -EAGAIN;
 			goto out;
 		}
 
-		len = desc.len;
-		skb = sock_alloc_send_skb(sk, len, 1, &err);
-		if (unlikely(!skb))
-			goto out;
-
-		skb_put(skb, len);
-		addr = desc.addr;
-		buffer = xsk_buff_raw_get_data(xs->umem, addr);
-		err = skb_store_bits(skb, 0, buffer, len);
 		/* This is the backpressure mechanism for the Tx path.
 		 * Reserve space in the completion queue and only proceed
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
-			kfree_skb(skb);
+		err = xsk_cq_reserve_locked(xs->pool);
+		if (err) {
+			err = -EAGAIN;
 			goto out;
 		}
 
-		skb->dev = xs->dev;
-		skb->priority = sk->sk_priority;
-		skb->mark = sk->sk_mark;
-		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
-		skb->destructor = xsk_destruct_skb;
+		skb = xsk_build_skb(xs, &desc);
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
+			if (err != -EOVERFLOW)
+				goto out;
+			err = 0;
+			continue;
+		}
 
-		err = dev_direct_xmit(skb, xs->queue_id);
 		xskq_cons_release(xs->tx);
+
+		if (xp_mb_desc(&desc)) {
+			xs->skb = skb;
+			continue;
+		}
+
+		err = __dev_direct_xmit(skb, xs->queue_id);
+		if (err == NETDEV_TX_BUSY) {
+			/* Tell user-space to retry the send */
+			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
+			xsk_consume_skb(skb);
+			err = -EAGAIN;
+			goto out;
+		}
+
 		/* Ignore NET_XMIT_CN as packet might have been sent */
-		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
+		if (err == NET_XMIT_DROP) {
 			/* SKB completed but not sent */
 			err = -EBUSY;
+			xs->skb = NULL;
 			goto out;
 		}
 
 		sent_frame = true;
+		xs->skb = NULL;
+	}
+
+	if (xskq_has_descs(xs->tx)) {
+		if (xs->skb)
+			xsk_drop_skb(xs->skb);
+		xskq_cons_release(xs->tx);
 	}
 
 out:
 	if (sent_frame)
-		sk->sk_write_space(sk);
+		__xsk_tx_release(xs);
 
 	mutex_unlock(&xs->mutex);
 	return err;
 }
 
-static int __xsk_sendmsg(struct sock *sk)
+static int xsk_generic_xmit(struct sock *sk)
 {
-	struct xdp_sock *xs = xdp_sk(sk);
+	int ret;
+
+	/* Drop the RCU lock since the SKB path might sleep. */
+	rcu_read_unlock();
+	ret = __xsk_generic_xmit(sk);
+	/* Reaquire RCU lock before going into common code. */
+	rcu_read_lock();
+
+	return ret;
+}
 
+static bool xsk_no_wakeup(struct sock *sk)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* Prefer busy-polling, skip the wakeup. */
+	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
+	       napi_id_valid(READ_ONCE(sk->sk_napi_id));
+#else
+	return false;
+#endif
+}
+
+static int xsk_check_common(struct xdp_sock *xs)
+{
+	if (unlikely(!xsk_is_bound(xs)))
+		return -ENXIO;
 	if (unlikely(!(xs->dev->flags & IFF_UP)))
 		return -ENETDOWN;
+
+	return 0;
+}
+
+static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct xsk_buff_pool *pool;
+	int err;
+
+	err = xsk_check_common(xs);
+	if (err)
+		return err;
+	if (unlikely(need_wait))
+		return -EOPNOTSUPP;
 	if (unlikely(!xs->tx))
 		return -ENOBUFS;
 
-	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
+	if (sk_can_busy_loop(sk))
+		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+
+	if (xs->zc && xsk_no_wakeup(sk))
+		return 0;
+
+	pool = xs->pool;
+	if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
+		if (xs->zc)
+			return xsk_wakeup(xs, XDP_WAKEUP_TX);
+		return xsk_generic_xmit(sk);
+	}
+	return 0;
 }
 
 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
-	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
+	int ret;
+
+	rcu_read_lock();
+	ret = __xsk_sendmsg(sock, m, total_len);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
+{
+	bool need_wait = !(flags & MSG_DONTWAIT);
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
+	int err;
 
-	if (unlikely(!xsk_is_bound(xs)))
-		return -ENXIO;
+	err = xsk_check_common(xs);
+	if (err)
+		return err;
+	if (unlikely(!xs->rx))
+		return -ENOBUFS;
 	if (unlikely(need_wait))
 		return -EOPNOTSUPP;
 
-	return __xsk_sendmsg(sk);
+	if (sk_can_busy_loop(sk))
+		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
+
+	if (xsk_no_wakeup(sk))
+		return 0;
+
+	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
+		return xsk_wakeup(xs, XDP_WAKEUP_RX);
+	return 0;
+}
+
+static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = __xsk_recvmsg(sock, m, len, flags);
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static __poll_t xsk_poll(struct file *file, struct socket *sock,
 			 struct poll_table_struct *wait)
 {
-	__poll_t mask = datagram_poll(file, sock, wait);
+	__poll_t mask = 0;
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
-	struct xdp_umem *umem;
+	struct xsk_buff_pool *pool;
 
-	if (unlikely(!xsk_is_bound(xs)))
-		return mask;
+	sock_poll_wait(file, sock, wait);
+
+	rcu_read_lock();
+	if (xsk_check_common(xs))
+		goto out;
 
-	umem = xs->umem;
+	pool = xs->pool;
 
-	if (umem->need_wakeup) {
+	if (pool->cached_need_wakeup) {
 		if (xs->zc)
-			xsk_wakeup(xs, umem->need_wakeup);
-		else
+			xsk_wakeup(xs, pool->cached_need_wakeup);
+		else if (xs->tx)
 			/* Poll needs to drive Tx also in copy mode */
-			__xsk_sendmsg(sk);
+			xsk_generic_xmit(sk);
 	}
 
 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
 		mask |= EPOLLIN | EPOLLRDNORM;
-	if (xs->tx && !xskq_cons_is_full(xs->tx))
+	if (xs->tx && xsk_tx_writeable(xs))
 		mask |= EPOLLOUT | EPOLLWRNORM;
-
+out:
+	rcu_read_unlock();
 	return mask;
 }
@@ -477,14 +1161,13 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
 	WRITE_ONCE(xs->state, XSK_UNBOUND);
 
 	/* Wait for driver to stop using the xdp socket. */
-	xdp_del_sk_umem(xs->umem, xs);
-	xs->dev = NULL;
+	xp_del_xsk(xs->pool, xs);
 	synchronize_net();
 	dev_put(dev);
 }
 
 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
-					      struct xdp_sock ***map_entry)
+					      struct xdp_sock __rcu ***map_entry)
 {
 	struct xsk_map *map = NULL;
 	struct xsk_map_node *node;
@@ -495,7 +1178,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
 					node);
 	if (node) {
-		WARN_ON(xsk_map_inc(node->map));
+		bpf_map_inc(&node->map->map);
 		map = node->map;
 		*map_entry = node->map_entry;
 	}
@@ -520,12 +1203,12 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
 	 */
-	struct xdp_sock **map_entry = NULL;
+	struct xdp_sock __rcu **map_entry = NULL;
 	struct xsk_map *map;
 
 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
 		xsk_map_try_sock_delete(map, xs, map_entry);
-		xsk_map_put(map);
+		bpf_map_put(&map->map);
 	}
 }
@@ -540,13 +1223,14 @@ static int xsk_release(struct socket *sock)
 
 	net = sock_net(sk);
 
+	if (xs->skb)
+		xsk_drop_skb(xs->skb);
+
 	mutex_lock(&net->xdp.lock);
 	sk_del_node_init_rcu(sk);
 	mutex_unlock(&net->xdp.lock);
 
-	local_bh_disable();
 	sock_prot_inuse_add(net, sk->sk_prot, -1);
-	local_bh_enable();
 
 	xsk_delete_from_maps(xs);
 	mutex_lock(&xs->mutex);
@@ -555,11 +1239,12 @@ static int xsk_release(struct socket *sock)
 
 	xskq_destroy(xs->rx);
 	xskq_destroy(xs->tx);
+	xskq_destroy(xs->fq_tmp);
+	xskq_destroy(xs->cq_tmp);
 
 	sock_orphan(sk);
 	sock->sk = NULL;
 
-	sk_refcnt_debug_release(sk);
 	sock_put(sk);
 
 	return 0;
@@ -582,12 +1267,18 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
 	return sock;
 }
 
-static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static bool xsk_validate_queues(struct xdp_sock *xs)
+{
+	return xs->fq_tmp && xs->cq_tmp;
+}
+
+static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
 {
 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 	struct net_device *dev;
+	int bound_dev_if;
 	u32 flags, qid;
 	int err = 0;
@@ -598,7 +1289,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 
 	flags = sxdp->sxdp_flags;
 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
-		      XDP_USE_NEED_WAKEUP))
+		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
+		return -EINVAL;
+
+	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
 		return -EINVAL;
 
 	rtnl_lock();
@@ -614,6 +1309,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_release;
 	}
 
+	netdev_lock_ops(dev);
+
 	if (!xs->rx && !xs->tx) {
 		err = -EINVAL;
 		goto out_unlock;
@@ -626,7 +1323,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		struct socket *sock;
 
 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
-		    (flags & XDP_USE_NEED_WAKEUP)) {
+		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
 			/* Cannot specify flags for shared sockets. */
 			err = -EINVAL;
 			goto out_unlock;
@@ -650,29 +1347,93 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 			sockfd_put(sock);
 			goto out_unlock;
 		}
-		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
-			err = -EINVAL;
-			sockfd_put(sock);
-			goto out_unlock;
+
+		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+			/* Share the umem with another socket on another qid
+			 * and/or device.
+			 */
+			xs->pool = xp_create_and_assign_umem(xs,
+							     umem_xs->umem);
+			if (!xs->pool) {
+				err = -ENOMEM;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+
+			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
+						   qid);
+			if (err) {
+				xp_destroy(xs->pool);
+				xs->pool = NULL;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+		} else {
+			/* Share the buffer pool with the other socket. */
+			if (xs->fq_tmp || xs->cq_tmp) {
+				/* Do not allow setting your own fq or cq. */
+				err = -EINVAL;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+
+			xp_get_pool(umem_xs->pool);
+			xs->pool = umem_xs->pool;
+
+			/* If underlying shared umem was created without Tx
+			 * ring, allocate Tx descs array that Tx batching API
+			 * utilizes
+			 */
+			if (xs->tx && !xs->pool->tx_descs) {
+				err = xp_alloc_tx_descs(xs->pool, xs);
+				if (err) {
+					xp_put_pool(xs->pool);
+					xs->pool = NULL;
+					sockfd_put(sock);
+					goto out_unlock;
+				}
+			}
 		}
 
 		xdp_get_umem(umem_xs->umem);
 		WRITE_ONCE(xs->umem, umem_xs->umem);
 		sockfd_put(sock);
-	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
+	} else if (!xs->umem || !xsk_validate_queues(xs)) {
 		err = -EINVAL;
 		goto out_unlock;
 	} else {
 		/* This xsk has its own umem. */
-		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
-		if (err)
+		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
+		if (!xs->pool) {
+			err = -ENOMEM;
 			goto out_unlock;
+		}
+
+		err = xp_assign_dev(xs->pool, dev, qid, flags);
+		if (err) {
+			xp_destroy(xs->pool);
+			xs->pool = NULL;
+			goto out_unlock;
+		}
 	}
 
+	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
+	xs->fq_tmp = NULL;
+	xs->cq_tmp = NULL;
+
 	xs->dev = dev;
 	xs->zc = xs->umem->zc;
+	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
 	xs->queue_id = qid;
-	xdp_add_sk_umem(xs->umem, xs);
+	xp_add_xsk(xs->pool, xs);
+
+	if (qid < dev->real_num_rx_queues) {
+		struct netdev_rx_queue *rxq;
+
+		rxq = __netif_get_rx_queue(dev, qid);
+		if (rxq->napi)
+			__sk_mark_napi_id_once(sk, rxq->napi->napi_id);
+	}
 
 out_unlock:
 	if (err) {
@@ -684,6 +1445,7 @@ out_unlock:
 		smp_wmb();
 		WRITE_ONCE(xs->state, XSK_BOUND);
 	}
+	netdev_unlock_ops(dev);
 out_release:
 	mutex_unlock(&xs->mutex);
 	rtnl_unlock();
@@ -698,7 +1460,7 @@ struct xdp_umem_reg_v1 {
 };
 
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
-			  char __user *optval, unsigned int optlen)
+			  sockptr_t optval, unsigned int optlen)
 {
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
@@ -716,7 +1478,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
 		if (optlen < sizeof(entries))
 			return -EINVAL;
-		if (copy_from_user(&entries, optval, sizeof(entries)))
+		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
@@ -743,7 +1505,18 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		else if (optlen < sizeof(mr))
 			mr_size = sizeof(struct xdp_umem_reg_v1);
 
-		if (copy_from_user(&mr, optval, mr_size))
+		BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg));
+
+		/* Make sure the last field of the struct doesn't have
+		 * uninitialized padding. All padding has to be explicit
+		 * and has to be set to zero by the userspace to make
		 * struct xdp_umem_reg extensible in the future.
+		 */
+		BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) +
+			     sizeof_field(struct xdp_umem_reg, tx_metadata_len) !=
+			     sizeof(struct xdp_umem_reg));
+
+		if (copy_from_sockptr(&mr, optval, mr_size))
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
@@ -770,7 +1543,9 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		struct xsk_queue **q;
 		int entries;
 
-		if (copy_from_user(&entries, optval, sizeof(entries)))
+		if (optlen < sizeof(entries))
+			return -EINVAL;
+		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
@@ -778,19 +1553,28 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			mutex_unlock(&xs->mutex);
 			return -EBUSY;
 		}
-		if (!xs->umem) {
-			mutex_unlock(&xs->mutex);
-			return -EINVAL;
-		}
 
-		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
-			&xs->umem->cq;
+		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
+			&xs->cq_tmp;
 		err = xsk_init_queue(entries, q, true);
-		if (optname == XDP_UMEM_FILL_RING)
-			xp_set_fq(xs->umem->pool, *q);
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
+	case XDP_MAX_TX_SKB_BUDGET:
+	{
+		unsigned int budget;
+
+		if (optlen != sizeof(budget))
+			return -EINVAL;
+		if (copy_from_sockptr(&budget, optval, sizeof(budget)))
+			return -EFAULT;
+		if (!xs->tx ||
+		    budget < TX_BATCH_SIZE || budget > xs->tx->nentries)
+			return -EACCES;
+
+		WRITE_ONCE(xs->max_tx_budget, budget);
+		return 0;
+	}
 	default:
 		break;
 	}
@@ -812,6 +1596,12 @@ static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
 	ring->desc = offsetof(struct xdp_umem_ring, desc);
 }
 
+struct xdp_statistics_v1 {
+	__u64 rx_dropped;
+	__u64 rx_invalid_descs;
+	__u64 tx_invalid_descs;
+};
+
 static int xsk_getsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, int __user *optlen)
 {
@@ -830,20 +1620,36 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
 	switch (optname) {
 	case XDP_STATISTICS:
 	{
-		struct xdp_statistics stats;
+		struct xdp_statistics stats = {};
+		bool extra_stats = true;
+		size_t stats_size;
 
-		if (len < sizeof(stats))
+		if (len < sizeof(struct xdp_statistics_v1)) {
 			return -EINVAL;
+		} else if (len < sizeof(stats)) {
+			extra_stats = false;
+			stats_size = sizeof(struct xdp_statistics_v1);
+		} else {
+			stats_size = sizeof(stats);
+		}
 
 		mutex_lock(&xs->mutex);
 		stats.rx_dropped = xs->rx_dropped;
+		if (extra_stats) {
+			stats.rx_ring_full = xs->rx_queue_full;
+			stats.rx_fill_ring_empty_descs =
+				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
+			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
+		} else {
+			stats.rx_dropped += xs->rx_queue_full;
+		}
 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
 		mutex_unlock(&xs->mutex);
 
-		if (copy_to_user(optval, &stats, sizeof(stats)))
+		if (copy_to_user(optval, &stats, stats_size))
 			return -EFAULT;
-		if (put_user(sizeof(stats), optlen))
+		if (put_user(stats_size, optlen))
 			return -EFAULT;
 
 		return 0;
@@ -933,12 +1739,10 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long size = vma->vm_end - vma->vm_start;
 	struct xdp_sock *xs = xdp_sk(sock->sk);
+	int state = READ_ONCE(xs->state);
 	struct xsk_queue *q = NULL;
-	struct xdp_umem *umem;
-	unsigned long pfn;
-	struct page *qpg;
 
-	if (READ_ONCE(xs->state) != XSK_READY)
+	if (state != XSK_READY && state != XSK_BOUND)
 		return -EBUSY;
 
 	if (offset == XDP_PGOFF_RX_RING) {
@@ -946,16 +1750,14 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	} else if (offset == XDP_PGOFF_TX_RING) {
 		q = READ_ONCE(xs->tx);
 	} else {
-		umem = READ_ONCE(xs->umem);
-		if (!umem)
-			return -EINVAL;
-
 		/* Matches the smp_wmb() in XDP_UMEM_REG */
 		smp_rmb();
 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
-			q = READ_ONCE(umem->fq);
+			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
+						 READ_ONCE(xs->pool->fq);
 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
-			q = READ_ONCE(umem->cq);
+			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
+						 READ_ONCE(xs->pool->cq);
 	}
 
 	if (!q)
@@ -963,13 +1765,10 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 
 	/* Matches the smp_wmb() in xsk_init_queue */
 	smp_rmb();
-	qpg = virt_to_head_page(q->ring);
-	if (size > page_size(qpg))
+	if (size > q->ring_vmalloc_size)
 		return -EINVAL;
 
-	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
-	return remap_pfn_range(vma, vma->vm_start, pfn,
-			       size, vma->vm_page_prot);
+	return remap_vmalloc_range(vma, q->ring, 0);
 }
 
 static int xsk_notifier(struct notifier_block *this,
@@ -989,12 +1788,12 @@ static int xsk_notifier(struct notifier_block *this,
 			if (xs->dev == dev) {
 				sk->sk_err = ENETDOWN;
 				if (!sock_flag(sk, SOCK_DEAD))
-					sk->sk_error_report(sk);
+					sk_error_report(sk);
 
 				xsk_unbind_dev(xs);
 
-				/* Clear device references in umem. */
-				xdp_umem_clear_dev(xs->umem);
+				/* Clear device references. */
+				xp_clear_dev(xs->pool);
 			}
 			mutex_unlock(&xs->mutex);
 		}
@@ -1026,9 +1825,8 @@ static const struct proto_ops xsk_proto_ops = {
 	.setsockopt	= xsk_setsockopt,
 	.getsockopt	= xsk_getsockopt,
 	.sendmsg	= xsk_sendmsg,
-	.recvmsg	= sock_no_recvmsg,
+	.recvmsg	= xsk_recvmsg,
 	.mmap		= xsk_mmap,
-	.sendpage	= sock_no_sendpage,
 };
 
 static void xsk_destruct(struct sock *sk)
@@ -1038,16 +1836,15 @@ static void xsk_destruct(struct sock *sk)
 	if (!sock_flag(sk, SOCK_DEAD))
 		return;
 
-	xdp_put_umem(xs->umem);
-
-	sk_refcnt_debug_dec(sk);
+	if (!xp_put_pool(xs->pool))
+		xdp_put_umem(xs->umem, !xs->pool);
 }
 
 static int xsk_create(struct net *net, struct socket *sock, int protocol,
 		      int kern)
 {
-	struct sock *sk;
 	struct xdp_sock *xs;
+	struct sock *sk;
 
 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 		return -EPERM;
@@ -1070,15 +1867,13 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 	sk->sk_family = PF_XDP;
 
 	sk->sk_destruct = xsk_destruct;
-	sk_refcnt_debug_inc(sk);
 
 	sock_set_flag(sk, SOCK_RCU_FREE);
 
 	xs = xdp_sk(sk);
 	xs->state = XSK_READY;
+	xs->max_tx_budget = TX_BATCH_SIZE;
 	mutex_init(&xs->mutex);
-	spin_lock_init(&xs->rx_lock);
-	spin_lock_init(&xs->tx_completion_lock);
 
 	INIT_LIST_HEAD(&xs->map_list);
 	spin_lock_init(&xs->map_list_lock);
@@ -1087,9 +1882,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 	sk_add_node_rcu(sk, &net->xdp.list);
 	mutex_unlock(&net->xdp.lock);
 
-	local_bh_disable();
 	sock_prot_inuse_add(net, &xsk_proto, 1);
-	local_bh_enable();
 
 	return 0;
 }
@@ -1123,7 +1916,7 @@ static struct pernet_operations xsk_net_ops = {
 
 static int __init xsk_init(void)
 {
-	int err, cpu;
+	int err;
 
 	err = proto_register(&xsk_proto, 0 /* no slab */);
 	if (err)
@@ -1141,10 +1934,18 @@ static int __init xsk_init(void)
 	if (err)
 		goto out_pernet;
 
-	for_each_possible_cpu(cpu)
-		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
+	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
						 sizeof(struct xsk_addrs),
						 0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!xsk_tx_generic_cache) {
+		err = -ENOMEM;
+		goto out_unreg_notif;
+	}
+
 	return 0;
 
+out_unreg_notif:
+	unregister_netdevice_notifier(&xsk_netdev_notifier);
 out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
 out_sk:
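
The need_wakeup flag maintained by the xsk_{set,clear}_{rx,tx}_need_wakeup() helpers in the diff is visible to user space through the flags word of the mmapped rings. A minimal sketch of the matching userspace side, assuming a socket bound with XDP_USE_NEED_WAKEUP; the tx_ring_flags pointer stands for the Tx ring's mapped flags field (located via XDP_MMAP_OFFSETS) and is illustrative, not a fixed API:

#include <sys/socket.h>
#include <linux/if_xdp.h>

/* Only pay for the wakeup syscall when the kernel has flagged the ring. */
static void kick_tx_if_needed(int xsk_fd, const volatile __u32 *tx_ring_flags)
{
	if (*tx_ring_flags & XDP_RING_NEED_WAKEUP)
		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
}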
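xsk_build_skb() overloads skb_shinfo(skb)->destructor_arg: in the common single-descriptor case it stores the completion address directly with bit 0 set, and only falls back to a kmem_cache-allocated struct xsk_addrs for multi-buffer packets. This works because allocator-returned pointers are at least word-aligned, so bit 0 of a real pointer is always clear. A standalone sketch of the tagging scheme, with illustrative names rather than the kernel's:

#include <stdint.h>

union tagged_arg {
	void *ptr;     /* multi-desc case: points to an out-of-line addrs array */
	uintptr_t raw; /* single-desc case: the address itself, tagged */
};

/* Bit 0 set means "this is a plain address, no allocation was made". */
static inline int is_addr(union tagged_arg a)
{
	return a.raw & 0x1UL;
}

static inline union tagged_arg tag_addr(uint64_t addr)
{
	return (union tagged_arg){ .raw = (uintptr_t)addr | 0x1UL };
}

static inline uint64_t untag_addr(union tagged_arg a)
{
	return (uint64_t)(a.raw & ~0x1UL);
}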
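The XDP_MAX_TX_SKB_BUDGET case added to xsk_setsockopt() lets an application raise the per-sendmsg() batch limit in copy mode, from the default TX_BATCH_SIZE (32) up to the size of the Tx ring. A minimal userspace sketch, assuming a uapi header that defines the option; the fd and the chosen value of 64 are illustrative:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static int set_tx_budget(int xsk_fd, unsigned int budget)
{
	/* The kernel rejects this with -EACCES unless the socket has a
	 * Tx ring and TX_BATCH_SIZE <= budget <= the Tx ring size.
	 */
	if (setsockopt(xsk_fd, SOL_XDP, XDP_MAX_TX_SKB_BUDGET,
		       &budget, sizeof(budget)) < 0) {
		perror("setsockopt(XDP_MAX_TX_SKB_BUDGET)");
		return -1;
	}
	return 0;
}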
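xsk_getsockopt() sizes the XDP_STATISTICS reply by the caller's optlen, so binaries built against the three-field struct xdp_statistics_v1 keep working while newer callers also receive the ring-full and empty-descriptor counters. A userspace sketch of reading the extended counters, with error handling trimmed:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

static void dump_xsk_stats(int xsk_fd)
{
	struct xdp_statistics stats;
	socklen_t optlen = sizeof(stats); /* request the full, extended reply */

	if (getsockopt(xsk_fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen))
		return;

	printf("rx_dropped: %llu\n", (unsigned long long)stats.rx_dropped);
	if (optlen >= sizeof(stats)) /* kernel filled the extra fields */
		printf("rx_ring_full: %llu\n",
		       (unsigned long long)stats.rx_ring_full);
}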
