Diffstat (limited to 'drivers/net/xen-netfront.c')
| -rw-r--r-- | drivers/net/xen-netfront.c | 1060 |
1 file changed, 779 insertions(+), 281 deletions(-)
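
The central data-structure change in this diff replaces the old `union skb_entry` array with a plain `tx_skbs[]` pointer array plus a `tx_link[]` free list, where `TX_LINK_NONE` terminates the chain and `TX_PENDING` marks slots whose requests have been queued but not yet pushed to the backend. The following is a minimal, self-contained user-space sketch of that bookkeeping, assuming a small ring size for illustration; it is not part of the patch itself:

```c
#include <stdio.h>

#define RING_SIZE    8
#define TX_LINK_NONE 0xffff
#define TX_PENDING   0xfffe

static unsigned short tx_link[RING_SIZE];
static unsigned int tx_skb_freelist;
static unsigned int tx_pend_queue = TX_LINK_NONE;

/* Push a slot id onto a singly linked list threaded through tx_link[]. */
static void add_id_to_list(unsigned int *head, unsigned short *list,
			   unsigned short id)
{
	list[id] = *head;
	*head = id;
}

/* Pop a slot id, leaving the slot marked TX_LINK_NONE while it is in use. */
static unsigned short get_id_from_list(unsigned int *head, unsigned short *list)
{
	unsigned int id = *head;

	if (id != TX_LINK_NONE) {
		*head = list[id];
		list[id] = TX_LINK_NONE;
	}
	return id;
}

int main(void)
{
	unsigned int i;
	unsigned short id;

	/* Free-list initialisation, mirroring xennet_init_queue(). */
	tx_skb_freelist = 0;
	for (i = 0; i < RING_SIZE; i++)
		tx_link[i] = i + 1;
	tx_link[RING_SIZE - 1] = TX_LINK_NONE;

	/* Claim a slot for a request and park it on the pending queue. */
	id = get_id_from_list(&tx_skb_freelist, tx_link);
	add_id_to_list(&tx_pend_queue, tx_link, id);

	/* Producer index about to be pushed: mark queued slots pending. */
	while ((i = get_id_from_list(&tx_pend_queue, tx_link)) != TX_LINK_NONE)
		tx_link[i] = TX_PENDING;

	/* Response arrives: only TX_PENDING slots are legitimate. */
	if (tx_link[id] == TX_PENDING) {
		tx_link[id] = TX_LINK_NONE;
		add_id_to_list(&tx_skb_freelist, tx_link, id);
		printf("slot %u completed and returned to the free list\n", id);
	}
	return 0;
}
```

The `TX_PENDING` check is what lets `xennet_tx_buf_gc()` reject responses for slots the frontend never handed to the backend, which is part of the hardening against a misbehaving backend that this patch introduces.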
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index c914c24f880b..7c2220366623 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -44,6 +44,9 @@ #include <linux/mm.h> #include <linux/slab.h> #include <net/ip.h> +#include <linux/bpf.h> +#include <net/page_pool/types.h> +#include <linux/bpf_trace.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -63,6 +66,12 @@ module_param_named(max_queues, xennet_max_queues, uint, 0644); MODULE_PARM_DESC(max_queues, "Maximum number of queues per virtual interface"); +static bool __read_mostly xennet_trusted = true; +module_param_named(trusted, xennet_trusted, bool, 0644); +MODULE_PARM_DESC(trusted, "Is the backend trusted"); + +#define XENNET_TIMEOUT (5 * HZ) + static const struct ethtool_ops xennet_ethtool_ops; struct netfront_cb { @@ -73,8 +82,6 @@ struct netfront_cb { #define RX_COPY_THRESHOLD 256 -#define GRANT_INVALID_REF 0 - #define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE) #define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, XEN_PAGE_SIZE) @@ -102,6 +109,8 @@ struct netfront_queue { char name[QUEUE_NAME_SIZE]; /* DEVNAME-qN */ struct netfront_info *info; + struct bpf_prog __rcu *xdp_prog; + struct napi_struct napi; /* Split event channels support, tx_* == rx_* when using @@ -119,21 +128,17 @@ struct netfront_queue { /* * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries - * are linked from tx_skb_freelist through skb_entry.link. - * - * NB. Freelist index entries are always going to be less than - * PAGE_OFFSET, whereas pointers to skbs will always be equal or - * greater than PAGE_OFFSET: we use this property to distinguish - * them. + * are linked from tx_skb_freelist through tx_link. */ - union skb_entry { - struct sk_buff *skb; - unsigned long link; - } tx_skbs[NET_TX_RING_SIZE]; + struct sk_buff *tx_skbs[NET_TX_RING_SIZE]; + unsigned short tx_link[NET_TX_RING_SIZE]; +#define TX_LINK_NONE 0xffff +#define TX_PENDING 0xfffe grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE]; struct page *grant_tx_page[NET_TX_RING_SIZE]; unsigned tx_skb_freelist; + unsigned int tx_pend_queue; spinlock_t rx_lock ____cacheline_aligned_in_smp; struct xen_netif_rx_front_ring rx; @@ -144,6 +149,12 @@ struct netfront_queue { struct sk_buff *rx_skbs[NET_RX_RING_SIZE]; grant_ref_t gref_rx_head; grant_ref_t grant_rx_ref[NET_RX_RING_SIZE]; + + unsigned int rx_rsp_unconsumed; + spinlock_t rx_cons_lock; + + struct page_pool *page_pool; + struct xdp_rxq_info xdp_rxq; }; struct netfront_info { @@ -159,6 +170,16 @@ struct netfront_info { struct netfront_stats __percpu *rx_stats; struct netfront_stats __percpu *tx_stats; + /* XDP state */ + bool netback_has_xdp_headroom; + bool netfront_xdp_enabled; + + /* Is device behaving sane? */ + bool broken; + + /* Should skbs be bounced into a zeroed buffer? */ + bool bounce; + atomic_t rx_gso_checksum_fixup; }; @@ -167,33 +188,25 @@ struct netfront_rx_info { struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; }; -static void skb_entry_set_link(union skb_entry *list, unsigned short id) -{ - list->link = id; -} - -static int skb_entry_is_link(const union skb_entry *list) -{ - BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link)); - return (unsigned long)list->skb < PAGE_OFFSET; -} - /* * Access macros for acquiring freeing slots in tx_skbs[]. 
*/ -static void add_id_to_freelist(unsigned *head, union skb_entry *list, - unsigned short id) +static void add_id_to_list(unsigned *head, unsigned short *list, + unsigned short id) { - skb_entry_set_link(&list[id], *head); + list[id] = *head; *head = id; } -static unsigned short get_id_from_freelist(unsigned *head, - union skb_entry *list) +static unsigned short get_id_from_list(unsigned *head, unsigned short *list) { unsigned int id = *head; - *head = list[id].link; + + if (id != TX_LINK_NONE) { + *head = list[id]; + list[id] = TX_LINK_NONE; + } return id; } @@ -216,7 +229,7 @@ static grant_ref_t xennet_get_rx_ref(struct netfront_queue *queue, { int i = xennet_rxidx(ri); grant_ref_t ref = queue->grant_rx_ref[i]; - queue->grant_rx_ref[i] = GRANT_INVALID_REF; + queue->grant_rx_ref[i] = INVALID_GRANT_REF; return ref; } @@ -232,7 +245,8 @@ static bool xennet_can_sg(struct net_device *dev) static void rx_refill_timeout(struct timer_list *t) { - struct netfront_queue *queue = from_timer(queue, t, rx_refill_timer); + struct netfront_queue *queue = timer_container_of(queue, t, + rx_refill_timer); napi_schedule(&queue->napi); } @@ -265,12 +279,14 @@ static struct sk_buff *xennet_alloc_one_rx_buffer(struct netfront_queue *queue) if (unlikely(!skb)) return NULL; - page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); - if (!page) { + page = page_pool_alloc_pages(queue->page_pool, + GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO); + if (unlikely(!page)) { kfree_skb(skb); return NULL; } skb_add_rx_frag(skb, 0, page, 0, 0, PAGE_SIZE); + skb_mark_for_recycle(skb); /* Align ip header to a 16 bytes boundary */ skb_reserve(skb, NET_IP_ALIGN); @@ -349,7 +365,7 @@ static int xennet_open(struct net_device *dev) unsigned int i = 0; struct netfront_queue *queue = NULL; - if (!np->queues) + if (!np->queues || np->broken) return -ENODEV; for (i = 0; i < num_queues; ++i) { @@ -371,41 +387,62 @@ static int xennet_open(struct net_device *dev) return 0; } -static void xennet_tx_buf_gc(struct netfront_queue *queue) +static bool xennet_tx_buf_gc(struct netfront_queue *queue) { RING_IDX cons, prod; unsigned short id; struct sk_buff *skb; bool more_to_do; + bool work_done = false; + const struct device *dev = &queue->info->netdev->dev; BUG_ON(!netif_carrier_ok(queue->info->netdev)); do { prod = queue->tx.sring->rsp_prod; + if (RING_RESPONSE_PROD_OVERFLOW(&queue->tx, prod)) { + dev_alert(dev, "Illegal number of responses %u\n", + prod - queue->tx.rsp_cons); + goto err; + } rmb(); /* Ensure we see responses up to 'rp'. 
*/ for (cons = queue->tx.rsp_cons; cons != prod; cons++) { - struct xen_netif_tx_response *txrsp; + struct xen_netif_tx_response txrsp; - txrsp = RING_GET_RESPONSE(&queue->tx, cons); - if (txrsp->status == XEN_NETIF_RSP_NULL) + work_done = true; + + RING_COPY_RESPONSE(&queue->tx, cons, &txrsp); + if (txrsp.status == XEN_NETIF_RSP_NULL) continue; - id = txrsp->id; - skb = queue->tx_skbs[id].skb; - if (unlikely(gnttab_query_foreign_access( - queue->grant_tx_ref[id]) != 0)) { - pr_alert("%s: warning -- grant still in use by backend domain\n", - __func__); - BUG(); + id = txrsp.id; + if (id >= RING_SIZE(&queue->tx)) { + dev_alert(dev, + "Response has incorrect id (%u)\n", + id); + goto err; + } + if (queue->tx_link[id] != TX_PENDING) { + dev_alert(dev, + "Response for inactive request\n"); + goto err; + } + + queue->tx_link[id] = TX_LINK_NONE; + skb = queue->tx_skbs[id]; + queue->tx_skbs[id] = NULL; + if (unlikely(!gnttab_end_foreign_access_ref( + queue->grant_tx_ref[id]))) { + dev_alert(dev, + "Grant still in use by backend domain\n"); + goto err; } - gnttab_end_foreign_access_ref( - queue->grant_tx_ref[id], GNTMAP_readonly); gnttab_release_grant_reference( &queue->gref_tx_head, queue->grant_tx_ref[id]); - queue->grant_tx_ref[id] = GRANT_INVALID_REF; + queue->grant_tx_ref[id] = INVALID_GRANT_REF; queue->grant_tx_page[id] = NULL; - add_id_to_freelist(&queue->tx_skb_freelist, queue->tx_skbs, id); + add_id_to_list(&queue->tx_skb_freelist, queue->tx_link, id); dev_kfree_skb_irq(skb); } @@ -415,13 +452,22 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) } while (more_to_do); xennet_maybe_wake_tx(queue); + + return work_done; + + err: + queue->info->broken = true; + dev_alert(dev, "Disabled for further use\n"); + + return work_done; } struct xennet_gnttab_make_txreq { struct netfront_queue *queue; struct sk_buff *skb; struct page *page; - struct xen_netif_tx_request *tx; /* Last request */ + struct xen_netif_tx_request *tx; /* Last request on ring page */ + struct xen_netif_tx_request tx_local; /* Last request local copy*/ unsigned int size; }; @@ -437,7 +483,7 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, struct netfront_queue *queue = info->queue; struct sk_buff *skb = info->skb; - id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs); + id = get_id_from_list(&queue->tx_skb_freelist, queue->tx_link); tx = RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++); ref = gnttab_claim_grant_reference(&queue->gref_tx_head); WARN_ON_ONCE(IS_ERR_VALUE((unsigned long)(int)ref)); @@ -445,34 +491,37 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id, gfn, GNTMAP_readonly); - queue->tx_skbs[id].skb = skb; + queue->tx_skbs[id] = skb; queue->grant_tx_page[id] = page; queue->grant_tx_ref[id] = ref; - tx->id = id; - tx->gref = ref; - tx->offset = offset; - tx->size = len; - tx->flags = 0; + info->tx_local.id = id; + info->tx_local.gref = ref; + info->tx_local.offset = offset; + info->tx_local.size = len; + info->tx_local.flags = 0; + + *tx = info->tx_local; + + /* + * Put the request in the pending queue, it will be set to be pending + * when the producer index is about to be raised. 
+ */ + add_id_to_list(&queue->tx_pend_queue, queue->tx_link, id); info->tx = tx; - info->size += tx->size; + info->size += info->tx_local.size; } static struct xen_netif_tx_request *xennet_make_first_txreq( - struct netfront_queue *queue, struct sk_buff *skb, - struct page *page, unsigned int offset, unsigned int len) + struct xennet_gnttab_make_txreq *info, + unsigned int offset, unsigned int len) { - struct xennet_gnttab_make_txreq info = { - .queue = queue, - .skb = skb, - .page = page, - .size = 0, - }; + info->size = 0; - gnttab_for_one_grant(page, offset, len, xennet_tx_setup_grant, &info); + gnttab_for_one_grant(info->page, offset, len, xennet_tx_setup_grant, info); - return info.tx; + return info->tx; } static void xennet_make_one_txreq(unsigned long gfn, unsigned int offset, @@ -485,35 +534,27 @@ static void xennet_make_one_txreq(unsigned long gfn, unsigned int offset, xennet_tx_setup_grant(gfn, offset, len, data); } -static struct xen_netif_tx_request *xennet_make_txreqs( - struct netfront_queue *queue, struct xen_netif_tx_request *tx, - struct sk_buff *skb, struct page *page, +static void xennet_make_txreqs( + struct xennet_gnttab_make_txreq *info, + struct page *page, unsigned int offset, unsigned int len) { - struct xennet_gnttab_make_txreq info = { - .queue = queue, - .skb = skb, - .tx = tx, - }; - /* Skip unused frames from start of page */ page += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; while (len) { - info.page = page; - info.size = 0; + info->page = page; + info->size = 0; gnttab_foreach_grant_in_range(page, offset, len, xennet_make_one_txreq, - &info); + info); page++; offset = 0; - len -= info.size; + len -= info->size; } - - return info.tx; } /* @@ -531,7 +572,7 @@ static int xennet_count_skb_slots(struct sk_buff *skb) for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; unsigned long size = skb_frag_size(frag); - unsigned long offset = frag->page_offset; + unsigned long offset = skb_frag_off(frag); /* Skip unused frames from start of page */ offset &= ~PAGE_MASK; @@ -543,8 +584,7 @@ static int xennet_count_skb_slots(struct sk_buff *skb) } static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback) + struct net_device *sb_dev) { unsigned int num_queues = dev->real_num_tx_queues; u32 hash; @@ -561,13 +601,113 @@ static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, return queue_idx; } +static void xennet_mark_tx_pending(struct netfront_queue *queue) +{ + unsigned int i; + + while ((i = get_id_from_list(&queue->tx_pend_queue, queue->tx_link)) != + TX_LINK_NONE) + queue->tx_link[i] = TX_PENDING; +} + +static int xennet_xdp_xmit_one(struct net_device *dev, + struct netfront_queue *queue, + struct xdp_frame *xdpf) +{ + struct netfront_info *np = netdev_priv(dev); + struct netfront_stats *tx_stats = this_cpu_ptr(np->tx_stats); + struct xennet_gnttab_make_txreq info = { + .queue = queue, + .skb = NULL, + .page = virt_to_page(xdpf->data), + }; + int notify; + + xennet_make_first_txreq(&info, + offset_in_page(xdpf->data), + xdpf->len); + + xennet_mark_tx_pending(queue); + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&queue->tx, notify); + if (notify) + notify_remote_via_irq(queue->tx_irq); + + u64_stats_update_begin(&tx_stats->syncp); + tx_stats->bytes += xdpf->len; + tx_stats->packets++; + u64_stats_update_end(&tx_stats->syncp); + + return 0; +} + +static int xennet_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + 
unsigned int num_queues = dev->real_num_tx_queues; + struct netfront_info *np = netdev_priv(dev); + struct netfront_queue *queue = NULL; + unsigned long irq_flags; + int nxmit = 0; + int i; + + if (unlikely(np->broken)) + return -ENODEV; + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + queue = &np->queues[smp_processor_id() % num_queues]; + + spin_lock_irqsave(&queue->tx_lock, irq_flags); + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + + if (!xdpf) + continue; + if (xennet_xdp_xmit_one(dev, queue, xdpf)) + break; + nxmit++; + } + spin_unlock_irqrestore(&queue->tx_lock, irq_flags); + + return nxmit; +} + +static struct sk_buff *bounce_skb(const struct sk_buff *skb) +{ + unsigned int headerlen = skb_headroom(skb); + /* Align size to allocate full pages and avoid contiguous data leaks */ + unsigned int size = ALIGN(skb_end_offset(skb) + skb->data_len, + XEN_PAGE_SIZE); + struct sk_buff *n = alloc_skb(size, GFP_ATOMIC | __GFP_ZERO); + + if (!n) + return NULL; + + if (!IS_ALIGNED((uintptr_t)n->head, XEN_PAGE_SIZE)) { + WARN_ONCE(1, "misaligned skb allocated\n"); + kfree_skb(n); + return NULL; + } + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); + + skb_copy_header(n, skb); + return n; +} + #define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1) static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); struct netfront_stats *tx_stats = this_cpu_ptr(np->tx_stats); - struct xen_netif_tx_request *tx, *first_tx; + struct xen_netif_tx_request *first_tx; unsigned int i; int notify; int slots; @@ -576,6 +716,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev unsigned int len; unsigned long flags; struct netfront_queue *queue = NULL; + struct xennet_gnttab_make_txreq info = { }; unsigned int num_queues = dev->real_num_tx_queues; u16 queue_index; struct sk_buff *nskb; @@ -583,6 +724,8 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev /* Drop the packet if no queues are set up */ if (num_queues < 1) goto drop; + if (unlikely(np->broken)) + goto drop; /* Determine which queue to transmit this SKB on */ queue_index = skb_get_queue_mapping(skb); queue = &np->queues[queue_index]; @@ -610,9 +753,13 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev /* The first req should be at least ETH_HLEN size or the packet will be * dropped by netback. + * + * If the backend is not trusted bounce all data to zeroed pages to + * avoid exposing contiguous data on the granted page not belonging to + * the skb. */ - if (unlikely(PAGE_SIZE - offset < ETH_HLEN)) { - nskb = skb_copy(skb, GFP_ATOMIC); + if (np->bounce || unlikely(PAGE_SIZE - offset < ETH_HLEN)) { + nskb = bounce_skb(skb); if (!nskb) goto drop; dev_consume_skb_any(skb); @@ -633,21 +780,24 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev } /* First request for the linear area. */ - first_tx = tx = xennet_make_first_txreq(queue, skb, - page, offset, len); - offset += tx->size; + info.queue = queue; + info.skb = skb; + info.page = page; + first_tx = xennet_make_first_txreq(&info, offset, len); + offset += info.tx_local.size; if (offset == PAGE_SIZE) { page++; offset = 0; } - len -= tx->size; + len -= info.tx_local.size; if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? 
*/ - tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated; + first_tx->flags |= XEN_NETTXF_csum_blank | + XEN_NETTXF_data_validated; else if (skb->ip_summed == CHECKSUM_UNNECESSARY) /* remote but checksummed. */ - tx->flags |= XEN_NETTXF_data_validated; + first_tx->flags |= XEN_NETTXF_data_validated; /* Optional extra info after the first request. */ if (skb_shinfo(skb)->gso_size) { @@ -656,7 +806,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev gso = (struct xen_netif_extra_info *) RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++); - tx->flags |= XEN_NETTXF_extra_info; + first_tx->flags |= XEN_NETTXF_extra_info; gso->u.gso.size = skb_shinfo(skb)->gso_size; gso->u.gso.type = (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) ? @@ -670,19 +820,24 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev } /* Requests for the rest of the linear area. */ - tx = xennet_make_txreqs(queue, tx, skb, page, offset, len); + xennet_make_txreqs(&info, page, offset, len); /* Requests for all the frags. */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - tx = xennet_make_txreqs(queue, tx, skb, - skb_frag_page(frag), frag->page_offset, + xennet_make_txreqs(&info, skb_frag_page(frag), + skb_frag_off(frag), skb_frag_size(frag)); } /* First request has the packet length. */ first_tx->size = skb->len; + /* timestamp packet in software */ + skb_tx_timestamp(skb); + + xennet_mark_tx_pending(queue); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&queue->tx, notify); if (notify) notify_remote_via_irq(queue->tx_irq); @@ -692,9 +847,6 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev tx_stats->packets++; u64_stats_update_end(&tx_stats->syncp); - /* Note: It is not safe to access skb after xennet_tx_buf_gc()! */ - xennet_tx_buf_gc(queue); - if (!netfront_tx_slot_available(queue)) netif_tx_stop_queue(netdev_get_tx_queue(dev, queue->id)); @@ -711,7 +863,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev static int xennet_close(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); - unsigned int num_queues = dev->real_num_tx_queues; + unsigned int num_queues = np->queues ? 
dev->real_num_tx_queues : 0; unsigned int i; struct netfront_queue *queue; netif_tx_stop_all_queues(np->netdev); @@ -722,6 +874,41 @@ static int xennet_close(struct net_device *dev) return 0; } +static void xennet_destroy_queues(struct netfront_info *info) +{ + unsigned int i; + + if (!info->queues) + return; + + for (i = 0; i < info->netdev->real_num_tx_queues; i++) { + struct netfront_queue *queue = &info->queues[i]; + + if (netif_running(info->netdev)) + napi_disable(&queue->napi); + netif_napi_del(&queue->napi); + } + + kfree(info->queues); + info->queues = NULL; +} + +static void xennet_uninit(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + xennet_destroy_queues(np); +} + +static void xennet_set_rx_rsp_cons(struct netfront_queue *queue, RING_IDX val) +{ + unsigned long flags; + + spin_lock_irqsave(&queue->rx_cons_lock, flags); + queue->rx.rsp_cons = val; + queue->rx_rsp_unconsumed = XEN_RING_NR_UNCONSUMED_RESPONSES(&queue->rx); + spin_unlock_irqrestore(&queue->rx_cons_lock, flags); +} + static void xennet_move_rx_slot(struct netfront_queue *queue, struct sk_buff *skb, grant_ref_t ref) { @@ -740,7 +927,7 @@ static int xennet_get_extras(struct netfront_queue *queue, RING_IDX rp) { - struct xen_netif_extra_info *extra; + struct xen_netif_extra_info extra; struct device *dev = &queue->info->netdev->dev; RING_IDX cons = queue->rx.rsp_cons; int err = 0; @@ -756,66 +943,119 @@ static int xennet_get_extras(struct netfront_queue *queue, break; } - extra = (struct xen_netif_extra_info *) - RING_GET_RESPONSE(&queue->rx, ++cons); + RING_COPY_RESPONSE(&queue->rx, ++cons, &extra); - if (unlikely(!extra->type || - extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { + if (unlikely(!extra.type || + extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) { if (net_ratelimit()) dev_warn(dev, "Invalid extra type: %d\n", - extra->type); + extra.type); err = -EINVAL; } else { - memcpy(&extras[extra->type - 1], extra, - sizeof(*extra)); + extras[extra.type - 1] = extra; } skb = xennet_get_rx_skb(queue, cons); ref = xennet_get_rx_ref(queue, cons); xennet_move_rx_slot(queue, skb, ref); - } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE); - queue->rx.rsp_cons = cons; + xennet_set_rx_rsp_cons(queue, cons); return err; } +static u32 xennet_run_xdp(struct netfront_queue *queue, struct page *pdata, + struct xen_netif_rx_response *rx, struct bpf_prog *prog, + struct xdp_buff *xdp, bool *need_xdp_flush) +{ + struct xdp_frame *xdpf; + u32 len = rx->status; + u32 act; + int err; + + xdp_init_buff(xdp, XEN_PAGE_SIZE - XDP_PACKET_HEADROOM, + &queue->xdp_rxq); + xdp_prepare_buff(xdp, page_address(pdata), XDP_PACKET_HEADROOM, + len, false); + + act = bpf_prog_run_xdp(prog, xdp); + switch (act) { + case XDP_TX: + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) { + trace_xdp_exception(queue->info->netdev, prog, act); + break; + } + get_page(pdata); + err = xennet_xdp_xmit(queue->info->netdev, 1, &xdpf, 0); + if (unlikely(err <= 0)) { + if (err < 0) + trace_xdp_exception(queue->info->netdev, prog, act); + xdp_return_frame_rx_napi(xdpf); + } + break; + case XDP_REDIRECT: + get_page(pdata); + err = xdp_do_redirect(queue->info->netdev, xdp, prog); + *need_xdp_flush = true; + if (unlikely(err)) { + trace_xdp_exception(queue->info->netdev, prog, act); + xdp_return_buff(xdp); + } + break; + case XDP_PASS: + case XDP_DROP: + break; + + case XDP_ABORTED: + trace_xdp_exception(queue->info->netdev, prog, act); + break; + + default: + 
bpf_warn_invalid_xdp_action(queue->info->netdev, prog, act); + } + + return act; +} + static int xennet_get_responses(struct netfront_queue *queue, struct netfront_rx_info *rinfo, RING_IDX rp, - struct sk_buff_head *list) + struct sk_buff_head *list, + bool *need_xdp_flush) { - struct xen_netif_rx_response *rx = &rinfo->rx; - struct xen_netif_extra_info *extras = rinfo->extras; - struct device *dev = &queue->info->netdev->dev; + struct xen_netif_rx_response *rx = &rinfo->rx, rx_local; + int max = XEN_NETIF_NR_SLOTS_MIN + (rx->status <= RX_COPY_THRESHOLD); RING_IDX cons = queue->rx.rsp_cons; struct sk_buff *skb = xennet_get_rx_skb(queue, cons); + struct xen_netif_extra_info *extras = rinfo->extras; grant_ref_t ref = xennet_get_rx_ref(queue, cons); - int max = XEN_NETIF_NR_SLOTS_MIN + (rx->status <= RX_COPY_THRESHOLD); + struct device *dev = &queue->info->netdev->dev; + struct bpf_prog *xdp_prog; + struct xdp_buff xdp; int slots = 1; int err = 0; - unsigned long ret; + u32 verdict; if (rx->flags & XEN_NETRXF_extra_info) { err = xennet_get_extras(queue, extras, rp); + if (!err) { + if (extras[XEN_NETIF_EXTRA_TYPE_XDP - 1].type) { + struct xen_netif_extra_info *xdp; + + xdp = &extras[XEN_NETIF_EXTRA_TYPE_XDP - 1]; + rx->offset = xdp->u.xdp.headroom; + } + } cons = queue->rx.rsp_cons; } for (;;) { - if (unlikely(rx->status < 0 || - rx->offset + rx->status > XEN_PAGE_SIZE)) { - if (net_ratelimit()) - dev_warn(dev, "rx->offset: %u, size: %d\n", - rx->offset, rx->status); - xennet_move_rx_slot(queue, skb, ref); - err = -EINVAL; - goto next; - } - /* * This definitely indicates a bug, either in this driver or in * the backend driver. In future this should flag the bad * situation to the system controller to reboot the backend. */ - if (ref == GRANT_INVALID_REF) { + if (ref == INVALID_GRANT_REF) { if (net_ratelimit()) dev_warn(dev, "Bad rx response id %d.\n", rx->id); @@ -823,11 +1063,43 @@ static int xennet_get_responses(struct netfront_queue *queue, goto next; } - ret = gnttab_end_foreign_access_ref(ref, 0); - BUG_ON(!ret); + if (unlikely(rx->status < 0 || + rx->offset + rx->status > XEN_PAGE_SIZE)) { + if (net_ratelimit()) + dev_warn(dev, "rx->offset: %u, size: %d\n", + rx->offset, rx->status); + xennet_move_rx_slot(queue, skb, ref); + err = -EINVAL; + goto next; + } + + if (!gnttab_end_foreign_access_ref(ref)) { + dev_alert(dev, + "Grant still in use by backend domain\n"); + queue->info->broken = true; + dev_alert(dev, "Disabled for further use\n"); + return -EINVAL; + } gnttab_release_grant_reference(&queue->gref_rx_head, ref); + rcu_read_lock(); + xdp_prog = rcu_dereference(queue->xdp_prog); + if (xdp_prog) { + if (!(rx->flags & XEN_NETRXF_more_data)) { + /* currently only a single page contains data */ + verdict = xennet_run_xdp(queue, + skb_frag_page(&skb_shinfo(skb)->frags[0]), + rx, xdp_prog, &xdp, need_xdp_flush); + if (verdict != XDP_PASS) + err = -EINVAL; + } else { + /* drop the frame */ + err = -EINVAL; + } + } + rcu_read_unlock(); + __skb_queue_tail(list, skb); next: @@ -841,7 +1113,8 @@ next: break; } - rx = RING_GET_RESPONSE(&queue->rx, cons + slots); + RING_COPY_RESPONSE(&queue->rx, cons + slots, &rx_local); + rx = &rx_local; skb = xennet_get_rx_skb(queue, cons + slots); ref = xennet_get_rx_ref(queue, cons + slots); slots++; @@ -854,7 +1127,7 @@ next: } if (unlikely(err)) - queue->rx.rsp_cons = cons + slots; + xennet_set_rx_rsp_cons(queue, cons + slots); return err; } @@ -888,18 +1161,19 @@ static int xennet_set_skb_gso(struct sk_buff *skb, return 0; } -static RING_IDX 
xennet_fill_frags(struct netfront_queue *queue, - struct sk_buff *skb, - struct sk_buff_head *list) +static int xennet_fill_frags(struct netfront_queue *queue, + struct sk_buff *skb, + struct sk_buff_head *list) { RING_IDX cons = queue->rx.rsp_cons; struct sk_buff *nskb; while ((nskb = __skb_dequeue(list))) { - struct xen_netif_rx_response *rx = - RING_GET_RESPONSE(&queue->rx, ++cons); + struct xen_netif_rx_response rx; skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0]; + RING_COPY_RESPONSE(&queue->rx, ++cons, &rx); + if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) { unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to; @@ -907,20 +1181,23 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue, __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); } if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) { - queue->rx.rsp_cons = ++cons; + xennet_set_rx_rsp_cons(queue, + ++cons + skb_queue_len(list)); kfree_skb(nskb); - return ~0U; + return -ENOENT; } skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, skb_frag_page(nfrag), - rx->offset, rx->status, PAGE_SIZE); + rx.offset, rx.status, PAGE_SIZE); skb_shinfo(nskb)->nr_frags = 0; kfree_skb(nskb); } - return cons; + xennet_set_rx_rsp_cons(queue, cons); + + return 0; } static int checksum_setup(struct net_device *dev, struct sk_buff *skb) @@ -997,6 +1274,7 @@ static int xennet_poll(struct napi_struct *napi, int budget) struct sk_buff_head errq; struct sk_buff_head tmpq; int err; + bool need_xdp_flush = false; spin_lock(&queue->rx_lock); @@ -1005,17 +1283,29 @@ static int xennet_poll(struct napi_struct *napi, int budget) skb_queue_head_init(&tmpq); rp = queue->rx.sring->rsp_prod; + if (RING_RESPONSE_PROD_OVERFLOW(&queue->rx, rp)) { + dev_alert(&dev->dev, "Illegal number of responses %u\n", + rp - queue->rx.rsp_cons); + queue->info->broken = true; + spin_unlock(&queue->rx_lock); + return 0; + } rmb(); /* Ensure we see queued responses up to 'rp'. 
*/ i = queue->rx.rsp_cons; work_done = 0; while ((i != rp) && (work_done < budget)) { - memcpy(rx, RING_GET_RESPONSE(&queue->rx, i), sizeof(*rx)); + RING_COPY_RESPONSE(&queue->rx, i, rx); memset(extras, 0, sizeof(rinfo.extras)); - err = xennet_get_responses(queue, &rinfo, rp, &tmpq); + err = xennet_get_responses(queue, &rinfo, rp, &tmpq, + &need_xdp_flush); if (unlikely(err)) { + if (queue->info->broken) { + spin_unlock(&queue->rx_lock); + return 0; + } err: while ((skb = __skb_dequeue(&tmpq))) __skb_queue_tail(&errq, skb); @@ -1032,7 +1322,9 @@ err: if (unlikely(xennet_set_skb_gso(skb, gso))) { __skb_queue_head(&tmpq, skb); - queue->rx.rsp_cons += skb_queue_len(&tmpq); + xennet_set_rx_rsp_cons(queue, + queue->rx.rsp_cons + + skb_queue_len(&tmpq)); goto err; } } @@ -1041,13 +1333,12 @@ err: if (NETFRONT_SKB_CB(skb)->pull_to > RX_COPY_THRESHOLD) NETFRONT_SKB_CB(skb)->pull_to = RX_COPY_THRESHOLD; - skb_shinfo(skb)->frags[0].page_offset = rx->offset; + skb_frag_off_set(&skb_shinfo(skb)->frags[0], rx->offset); skb_frag_size_set(&skb_shinfo(skb)->frags[0], rx->status); skb->data_len = rx->status; skb->len += rx->status; - i = xennet_fill_frags(queue, skb, &tmpq); - if (unlikely(i == ~0U)) + if (unlikely(xennet_fill_frags(queue, skb, &tmpq))) goto err; if (rx->flags & XEN_NETRXF_csum_blank) @@ -1057,9 +1348,12 @@ err: __skb_queue_tail(&rxq, skb); - queue->rx.rsp_cons = ++i; + i = queue->rx.rsp_cons + 1; + xennet_set_rx_rsp_cons(queue, i); work_done++; } + if (need_xdp_flush) + xdp_do_flush(); __skb_queue_purge(&errq); @@ -1088,7 +1382,7 @@ static int xennet_change_mtu(struct net_device *dev, int mtu) if (mtu > max) return -EINVAL; - dev->mtu = mtu; + WRITE_ONCE(dev->mtu, mtu); return 0; } @@ -1105,16 +1399,16 @@ static void xennet_get_stats64(struct net_device *dev, unsigned int start; do { - start = u64_stats_fetch_begin_irq(&tx_stats->syncp); + start = u64_stats_fetch_begin(&tx_stats->syncp); tx_packets = tx_stats->packets; tx_bytes = tx_stats->bytes; - } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start)); + } while (u64_stats_fetch_retry(&tx_stats->syncp, start)); do { - start = u64_stats_fetch_begin_irq(&rx_stats->syncp); + start = u64_stats_fetch_begin(&rx_stats->syncp); rx_packets = rx_stats->packets; rx_bytes = rx_stats->bytes; - } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start)); + } while (u64_stats_fetch_retry(&rx_stats->syncp, start)); tot->rx_packets += rx_packets; tot->tx_packets += tx_packets; @@ -1133,17 +1427,17 @@ static void xennet_release_tx_bufs(struct netfront_queue *queue) for (i = 0; i < NET_TX_RING_SIZE; i++) { /* Skip over entries which are actually freelist references */ - if (skb_entry_is_link(&queue->tx_skbs[i])) + if (!queue->tx_skbs[i]) continue; - skb = queue->tx_skbs[i].skb; + skb = queue->tx_skbs[i]; + queue->tx_skbs[i] = NULL; get_page(queue->grant_tx_page[i]); gnttab_end_foreign_access(queue->grant_tx_ref[i], - GNTMAP_readonly, - (unsigned long)page_address(queue->grant_tx_page[i])); + queue->grant_tx_page[i]); queue->grant_tx_page[i] = NULL; - queue->grant_tx_ref[i] = GRANT_INVALID_REF; - add_id_to_freelist(&queue->tx_skb_freelist, queue->tx_skbs, i); + queue->grant_tx_ref[i] = INVALID_GRANT_REF; + add_id_to_list(&queue->tx_skb_freelist, queue->tx_link, i); dev_kfree_skb_irq(skb); } } @@ -1163,7 +1457,7 @@ static void xennet_release_rx_bufs(struct netfront_queue *queue) continue; ref = queue->grant_rx_ref[id]; - if (ref == GRANT_INVALID_REF) + if (ref == INVALID_GRANT_REF) continue; page = skb_frag_page(&skb_shinfo(skb)->frags[0]); @@ -1172,9 
+1466,8 @@ static void xennet_release_rx_bufs(struct netfront_queue *queue) * foreign access is ended (which may be deferred). */ get_page(page); - gnttab_end_foreign_access(ref, 0, - (unsigned long)page_address(page)); - queue->grant_rx_ref[id] = GRANT_INVALID_REF; + gnttab_end_foreign_access(ref, page); + queue->grant_rx_ref[id] = INVALID_GRANT_REF; kfree_skb(skb); } @@ -1218,34 +1511,79 @@ static int xennet_set_features(struct net_device *dev, return 0; } -static irqreturn_t xennet_tx_interrupt(int irq, void *dev_id) +static bool xennet_handle_tx(struct netfront_queue *queue, unsigned int *eoi) { - struct netfront_queue *queue = dev_id; unsigned long flags; + if (unlikely(queue->info->broken)) + return false; + spin_lock_irqsave(&queue->tx_lock, flags); - xennet_tx_buf_gc(queue); + if (xennet_tx_buf_gc(queue)) + *eoi = 0; spin_unlock_irqrestore(&queue->tx_lock, flags); + return true; +} + +static irqreturn_t xennet_tx_interrupt(int irq, void *dev_id) +{ + unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; + + if (likely(xennet_handle_tx(dev_id, &eoiflag))) + xen_irq_lateeoi(irq, eoiflag); + return IRQ_HANDLED; } -static irqreturn_t xennet_rx_interrupt(int irq, void *dev_id) +static bool xennet_handle_rx(struct netfront_queue *queue, unsigned int *eoi) { - struct netfront_queue *queue = dev_id; - struct net_device *dev = queue->info->netdev; + unsigned int work_queued; + unsigned long flags; - if (likely(netif_carrier_ok(dev) && - RING_HAS_UNCONSUMED_RESPONSES(&queue->rx))) + if (unlikely(queue->info->broken)) + return false; + + spin_lock_irqsave(&queue->rx_cons_lock, flags); + work_queued = XEN_RING_NR_UNCONSUMED_RESPONSES(&queue->rx); + if (work_queued > queue->rx_rsp_unconsumed) { + queue->rx_rsp_unconsumed = work_queued; + *eoi = 0; + } else if (unlikely(work_queued < queue->rx_rsp_unconsumed)) { + const struct device *dev = &queue->info->netdev->dev; + + spin_unlock_irqrestore(&queue->rx_cons_lock, flags); + dev_alert(dev, "RX producer index going backwards\n"); + dev_alert(dev, "Disabled for further use\n"); + queue->info->broken = true; + return false; + } + spin_unlock_irqrestore(&queue->rx_cons_lock, flags); + + if (likely(netif_carrier_ok(queue->info->netdev) && work_queued)) napi_schedule(&queue->napi); + return true; +} + +static irqreturn_t xennet_rx_interrupt(int irq, void *dev_id) +{ + unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; + + if (likely(xennet_handle_rx(dev_id, &eoiflag))) + xen_irq_lateeoi(irq, eoiflag); + return IRQ_HANDLED; } static irqreturn_t xennet_interrupt(int irq, void *dev_id) { - xennet_tx_interrupt(irq, dev_id); - xennet_rx_interrupt(irq, dev_id); + unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; + + if (xennet_handle_tx(dev_id, &eoiflag) && + xennet_handle_rx(dev_id, &eoiflag)) + xen_irq_lateeoi(irq, eoiflag); + return IRQ_HANDLED; } @@ -1256,12 +1594,96 @@ static void xennet_poll_controller(struct net_device *dev) struct netfront_info *info = netdev_priv(dev); unsigned int num_queues = dev->real_num_tx_queues; unsigned int i; + + if (info->broken) + return; + for (i = 0; i < num_queues; ++i) xennet_interrupt(0, &info->queues[i]); } #endif +#define NETBACK_XDP_HEADROOM_DISABLE 0 +#define NETBACK_XDP_HEADROOM_ENABLE 1 + +static int talk_to_netback_xdp(struct netfront_info *np, int xdp) +{ + int err; + unsigned short headroom; + + headroom = xdp ? 
XDP_PACKET_HEADROOM : 0; + err = xenbus_printf(XBT_NIL, np->xbdev->nodename, + "xdp-headroom", "%hu", + headroom); + if (err) + pr_warn("Error writing xdp-headroom\n"); + + return err; +} + +static int xennet_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + unsigned long max_mtu = XEN_PAGE_SIZE - XDP_PACKET_HEADROOM; + struct netfront_info *np = netdev_priv(dev); + struct bpf_prog *old_prog; + unsigned int i, err; + + if (dev->mtu > max_mtu) { + netdev_warn(dev, "XDP requires MTU less than %lu\n", max_mtu); + return -EINVAL; + } + + if (!np->netback_has_xdp_headroom) + return 0; + + xenbus_switch_state(np->xbdev, XenbusStateReconfiguring); + + err = talk_to_netback_xdp(np, prog ? NETBACK_XDP_HEADROOM_ENABLE : + NETBACK_XDP_HEADROOM_DISABLE); + if (err) + return err; + + /* avoid the race with XDP headroom adjustment */ + wait_event(module_wq, + xenbus_read_driver_state(np->xbdev->otherend) == + XenbusStateReconfigured); + np->netfront_xdp_enabled = true; + + old_prog = rtnl_dereference(np->queues[0].xdp_prog); + + if (prog) + bpf_prog_add(prog, dev->real_num_tx_queues); + + for (i = 0; i < dev->real_num_tx_queues; ++i) + rcu_assign_pointer(np->queues[i].xdp_prog, prog); + + if (old_prog) + for (i = 0; i < dev->real_num_tx_queues; ++i) + bpf_prog_put(old_prog); + + xenbus_switch_state(np->xbdev, XenbusStateConnected); + + return 0; +} + +static int xennet_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct netfront_info *np = netdev_priv(dev); + + if (np->broken) + return -ENODEV; + + switch (xdp->command) { + case XDP_SETUP_PROG: + return xennet_xdp_set(dev, xdp->prog, xdp->extack); + default: + return -EINVAL; + } +} + static const struct net_device_ops xennet_netdev_ops = { + .ndo_uninit = xennet_uninit, .ndo_open = xennet_open, .ndo_stop = xennet_close, .ndo_start_xmit = xennet_start_xmit, @@ -1272,6 +1694,8 @@ static const struct net_device_ops xennet_netdev_ops = { .ndo_fix_features = xennet_fix_features, .ndo_set_features = xennet_set_features, .ndo_select_queue = xennet_select_queue, + .ndo_bpf = xennet_xdp, + .ndo_xdp_xmit = xennet_xdp_xmit, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = xennet_poll_controller, #endif @@ -1324,6 +1748,8 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) * negotiate with the backend regarding supported features. */ netdev->features |= netdev->hw_features; + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_NDO_XMIT; netdev->ethtool_ops = &xennet_ethtool_ops; netdev->min_mtu = ETH_MIN_MTU; @@ -1331,15 +1757,19 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) SET_NETDEV_DEV(netdev, &dev->dev); np->netdev = netdev; + np->netfront_xdp_enabled = false; netif_carrier_off(netdev); - xenbus_switch_state(dev, XenbusStateInitialising); - wait_event(module_wq, - xenbus_read_driver_state(dev->otherend) != - XenbusStateClosed && - xenbus_read_driver_state(dev->otherend) != - XenbusStateUnknown); + do { + xenbus_switch_state(dev, XenbusStateInitialising); + err = wait_event_timeout(module_wq, + xenbus_read_driver_state(dev->otherend) != + XenbusStateClosed && + xenbus_read_driver_state(dev->otherend) != + XenbusStateUnknown, XENNET_TIMEOUT); + } while (!err); + return netdev; exit: @@ -1347,7 +1777,7 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) return ERR_PTR(err); } -/** +/* * Entry point to this code when a new device is created. 
Allocate the basic * structures and the ring buffers for communication with the backend, and * inform the backend of the appropriate details for those. @@ -1378,8 +1808,8 @@ static int netfront_probe(struct xenbus_device *dev, static void xennet_end_access(int ref, void *page) { /* This frees the page as a side-effect */ - if (ref != GRANT_INVALID_REF) - gnttab_end_foreign_access(ref, 0, (unsigned long)page); + if (ref != INVALID_GRANT_REF) + gnttab_end_foreign_access(ref, virt_to_page(page)); } static void xennet_disconnect_backend(struct netfront_info *info) @@ -1392,7 +1822,7 @@ static void xennet_disconnect_backend(struct netfront_info *info) for (i = 0; i < num_queues && info->queues; ++i) { struct netfront_queue *queue = &info->queues[i]; - del_timer_sync(&queue->rx_refill_timer); + timer_delete_sync(&queue->rx_refill_timer); if (queue->tx_irq && (queue->tx_irq == queue->rx_irq)) unbind_from_irqhandler(queue->tx_irq, queue); @@ -1415,14 +1845,16 @@ static void xennet_disconnect_backend(struct netfront_info *info) xennet_end_access(queue->tx_ring_ref, queue->tx.sring); xennet_end_access(queue->rx_ring_ref, queue->rx.sring); - queue->tx_ring_ref = GRANT_INVALID_REF; - queue->rx_ring_ref = GRANT_INVALID_REF; + queue->tx_ring_ref = INVALID_GRANT_REF; + queue->rx_ring_ref = INVALID_GRANT_REF; queue->tx.sring = NULL; queue->rx.sring = NULL; + + page_pool_destroy(queue->page_pool); } } -/** +/* * We are reconnecting to the backend, due to a suspend/resume, or a backend * driver restart. We tear down our netif structure and recreate it, but * leave the device-layer structures intact so that this is transparent to the @@ -1434,7 +1866,17 @@ static int netfront_resume(struct xenbus_device *dev) dev_dbg(&dev->dev, "%s\n", dev->nodename); + netif_tx_lock_bh(info->netdev); + netif_device_detach(info->netdev); + netif_tx_unlock_bh(info->netdev); + xennet_disconnect_backend(info); + + rtnl_lock(); + if (info->queues) + xennet_destroy_queues(info); + rtnl_unlock(); + return 0; } @@ -1468,9 +1910,10 @@ static int setup_netfront_single(struct netfront_queue *queue) if (err < 0) goto fail; - err = bind_evtchn_to_irqhandler(queue->tx_evtchn, - xennet_interrupt, - 0, queue->info->netdev->name, queue); + err = bind_evtchn_to_irqhandler_lateeoi(queue->tx_evtchn, + xennet_interrupt, 0, + queue->info->netdev->name, + queue); if (err < 0) goto bind_fail; queue->rx_evtchn = queue->tx_evtchn; @@ -1498,18 +1941,18 @@ static int setup_netfront_split(struct netfront_queue *queue) snprintf(queue->tx_irq_name, sizeof(queue->tx_irq_name), "%s-tx", queue->name); - err = bind_evtchn_to_irqhandler(queue->tx_evtchn, - xennet_tx_interrupt, - 0, queue->tx_irq_name, queue); + err = bind_evtchn_to_irqhandler_lateeoi(queue->tx_evtchn, + xennet_tx_interrupt, 0, + queue->tx_irq_name, queue); if (err < 0) goto bind_tx_fail; queue->tx_irq = err; snprintf(queue->rx_irq_name, sizeof(queue->rx_irq_name), "%s-rx", queue->name); - err = bind_evtchn_to_irqhandler(queue->rx_evtchn, - xennet_rx_interrupt, - 0, queue->rx_irq_name, queue); + err = bind_evtchn_to_irqhandler_lateeoi(queue->rx_evtchn, + xennet_rx_interrupt, 0, + queue->rx_irq_name, queue); if (err < 0) goto bind_rx_fail; queue->rx_irq = err; @@ -1534,41 +1977,26 @@ static int setup_netfront(struct xenbus_device *dev, { struct xen_netif_tx_sring *txs; struct xen_netif_rx_sring *rxs; - grant_ref_t gref; int err; - queue->tx_ring_ref = GRANT_INVALID_REF; - queue->rx_ring_ref = GRANT_INVALID_REF; + queue->tx_ring_ref = INVALID_GRANT_REF; + queue->rx_ring_ref = INVALID_GRANT_REF; 
queue->rx.sring = NULL; queue->tx.sring = NULL; - txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); - if (!txs) { - err = -ENOMEM; - xenbus_dev_fatal(dev, err, "allocating tx ring page"); + err = xenbus_setup_ring(dev, GFP_NOIO | __GFP_HIGH, (void **)&txs, + 1, &queue->tx_ring_ref); + if (err) goto fail; - } - SHARED_RING_INIT(txs); - FRONT_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE); - err = xenbus_grant_ring(dev, txs, 1, &gref); - if (err < 0) - goto grant_tx_ring_fail; - queue->tx_ring_ref = gref; + XEN_FRONT_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE); - rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); - if (!rxs) { - err = -ENOMEM; - xenbus_dev_fatal(dev, err, "allocating rx ring page"); - goto alloc_rx_ring_fail; - } - SHARED_RING_INIT(rxs); - FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); + err = xenbus_setup_ring(dev, GFP_NOIO | __GFP_HIGH, (void **)&rxs, + 1, &queue->rx_ring_ref); + if (err) + goto fail; - err = xenbus_grant_ring(dev, rxs, 1, &gref); - if (err < 0) - goto grant_rx_ring_fail; - queue->rx_ring_ref = gref; + XEN_FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); if (feature_split_evtchn) err = setup_netfront_split(queue); @@ -1576,26 +2004,18 @@ static int setup_netfront(struct xenbus_device *dev, * a) feature-split-event-channels == 0 * b) feature-split-event-channels == 1 but failed to setup */ - if (!feature_split_evtchn || (feature_split_evtchn && err)) + if (!feature_split_evtchn || err) err = setup_netfront_single(queue); if (err) - goto alloc_evtchn_fail; + goto fail; return 0; - /* If we fail to setup netfront, it is safe to just revoke access to - * granted pages because backend is not accessing it at this point. - */ -alloc_evtchn_fail: - gnttab_end_foreign_access_ref(queue->rx_ring_ref, 0); -grant_rx_ring_fail: - free_page((unsigned long)rxs); -alloc_rx_ring_fail: - gnttab_end_foreign_access_ref(queue->tx_ring_ref, 0); -grant_tx_ring_fail: - free_page((unsigned long)txs); -fail: + fail: + xenbus_teardown_ring((void **)&queue->rx.sring, 1, &queue->rx_ring_ref); + xenbus_teardown_ring((void **)&queue->tx.sring, 1, &queue->tx_ring_ref); + return err; } @@ -1611,6 +2031,7 @@ static int xennet_init_queue(struct netfront_queue *queue) spin_lock_init(&queue->tx_lock); spin_lock_init(&queue->rx_lock); + spin_lock_init(&queue->rx_cons_lock); timer_setup(&queue->rx_refill_timer, rx_refill_timeout, 0); @@ -1618,18 +2039,20 @@ static int xennet_init_queue(struct netfront_queue *queue) snprintf(queue->name, sizeof(queue->name), "vif%s-q%u", devid, queue->id); - /* Initialise tx_skbs as a free chain containing every entry. */ + /* Initialise tx_skb_freelist as a free chain containing every entry. 
*/ queue->tx_skb_freelist = 0; + queue->tx_pend_queue = TX_LINK_NONE; for (i = 0; i < NET_TX_RING_SIZE; i++) { - skb_entry_set_link(&queue->tx_skbs[i], i+1); - queue->grant_tx_ref[i] = GRANT_INVALID_REF; + queue->tx_link[i] = i + 1; + queue->grant_tx_ref[i] = INVALID_GRANT_REF; queue->grant_tx_page[i] = NULL; } + queue->tx_link[NET_TX_RING_SIZE - 1] = TX_LINK_NONE; /* Clear out rx_skbs */ for (i = 0; i < NET_RX_RING_SIZE; i++) { queue->rx_skbs[i] = NULL; - queue->grant_rx_ref[i] = GRANT_INVALID_REF; + queue->grant_rx_ref[i] = INVALID_GRANT_REF; } /* A grant for every tx ring slot */ @@ -1738,20 +2161,49 @@ error: return err; } -static void xennet_destroy_queues(struct netfront_info *info) + + +static int xennet_create_page_pool(struct netfront_queue *queue) { - unsigned int i; + int err; + struct page_pool_params pp_params = { + .order = 0, + .flags = 0, + .pool_size = NET_RX_RING_SIZE, + .nid = NUMA_NO_NODE, + .dev = &queue->info->netdev->dev, + .offset = XDP_PACKET_HEADROOM, + .max_len = XEN_PAGE_SIZE - XDP_PACKET_HEADROOM, + }; - for (i = 0; i < info->netdev->real_num_tx_queues; i++) { - struct netfront_queue *queue = &info->queues[i]; + queue->page_pool = page_pool_create(&pp_params); + if (IS_ERR(queue->page_pool)) { + err = PTR_ERR(queue->page_pool); + queue->page_pool = NULL; + return err; + } - if (netif_running(info->netdev)) - napi_disable(&queue->napi); - netif_napi_del(&queue->napi); + err = xdp_rxq_info_reg(&queue->xdp_rxq, queue->info->netdev, + queue->id, 0); + if (err) { + netdev_err(queue->info->netdev, "xdp_rxq_info_reg failed\n"); + goto err_free_pp; } - kfree(info->queues); - info->queues = NULL; + err = xdp_rxq_info_reg_mem_model(&queue->xdp_rxq, + MEM_TYPE_PAGE_POOL, queue->page_pool); + if (err) { + netdev_err(queue->info->netdev, "xdp_rxq_info_reg_mem_model failed\n"); + goto err_unregister_rxq; + } + return 0; + +err_unregister_rxq: + xdp_rxq_info_unreg(&queue->xdp_rxq); +err_free_pp: + page_pool_destroy(queue->page_pool); + queue->page_pool = NULL; + return err; } static int xennet_create_queues(struct netfront_info *info, @@ -1779,8 +2231,15 @@ static int xennet_create_queues(struct netfront_info *info, break; } - netif_napi_add(queue->info->netdev, &queue->napi, - xennet_poll, 64); + /* use page pool recycling instead of buddy allocator */ + ret = xennet_create_page_pool(queue); + if (ret < 0) { + dev_err(&info->xbdev->dev, "can't allocate page pool\n"); + *num_queues = i; + return ret; + } + + netif_napi_add(queue->info->netdev, &queue->napi, xennet_poll); if (netif_running(info->netdev)) napi_enable(&queue->napi); } @@ -1806,9 +2265,14 @@ static int talk_to_netback(struct xenbus_device *dev, unsigned int max_queues = 0; struct netfront_queue *queue = NULL; unsigned int num_queues = 1; + u8 addr[ETH_ALEN]; info->netdev->irq = 0; + /* Check if backend is trusted. */ + info->bounce = !xennet_trusted || + !xenbus_read_unsigned(dev->nodename, "trusted", 1); + /* Check if backend supports multiple queues */ max_queues = xenbus_read_unsigned(info->xbdev->otherend, "multi-queue-max-queues", 1); @@ -1819,16 +2283,31 @@ static int talk_to_netback(struct xenbus_device *dev, "feature-split-event-channels", 0); /* Read mac addr. 
*/ - err = xen_net_read_mac(dev, info->netdev->dev_addr); + err = xen_net_read_mac(dev, addr); if (err) { xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); goto out_unlocked; } + eth_hw_addr_set(info->netdev, addr); + + info->netback_has_xdp_headroom = xenbus_read_unsigned(info->xbdev->otherend, + "feature-xdp-headroom", 0); + if (info->netback_has_xdp_headroom) { + /* set the current xen-netfront xdp state */ + err = talk_to_netback_xdp(info, info->netfront_xdp_enabled ? + NETBACK_XDP_HEADROOM_ENABLE : + NETBACK_XDP_HEADROOM_DISABLE); + if (err) + goto out_unlocked; + } rtnl_lock(); if (info->queues) xennet_destroy_queues(info); + /* For the case of a reconnect reset the "broken" indicator. */ + info->broken = false; + err = xennet_create_queues(info, &num_queues); if (err < 0) { xenbus_dev_fatal(dev, err, "creating queues"); @@ -1959,6 +2438,11 @@ static int xennet_connect(struct net_device *dev) err = talk_to_netback(np->xbdev, np); if (err) return err; + if (np->netback_has_xdp_headroom) + pr_info("backend supports XDP headroom\n"); + if (np->bounce) + dev_info(&np->xbdev->dev, + "bouncing transmitted data to zeroed pages\n"); /* talk_to_netback() sets the correct number of queues */ num_queues = dev->real_num_tx_queues; @@ -1982,6 +2466,10 @@ static int xennet_connect(struct net_device *dev) * domain a kick because we've probably just requeued some * packets. */ + netif_tx_lock_bh(np->netdev); + netif_device_attach(np->netdev); + netif_tx_unlock_bh(np->netdev); + netif_carrier_on(np->netdev); for (j = 0; j < num_queues; ++j) { queue = &np->queues[j]; @@ -1990,10 +2478,6 @@ static int xennet_connect(struct net_device *dev) if (queue->tx_irq != queue->rx_irq) notify_remote_via_irq(queue->rx_irq); - spin_lock_irq(&queue->tx_lock); - xennet_tx_buf_gc(queue); - spin_unlock_irq(&queue->tx_lock); - spin_lock_bh(&queue->rx_lock); xennet_alloc_rx_buffers(queue); spin_unlock_bh(&queue->rx_lock); @@ -2002,7 +2486,7 @@ static int xennet_connect(struct net_device *dev) return 0; } -/** +/* * Callback received when the backend's state changes. 
*/ static void netback_changed(struct xenbus_device *dev, @@ -2038,7 +2522,7 @@ static void netback_changed(struct xenbus_device *dev, case XenbusStateClosed: if (dev->state == XenbusStateClosed) break; - /* Missed the backend's CLOSING state -- fallthrough */ + fallthrough; /* Missed the backend's CLOSING state */ case XenbusStateClosing: xenbus_frontend_closed(dev); break; @@ -2095,6 +2579,7 @@ static const struct ethtool_ops xennet_ethtool_ops = .get_sset_count = xennet_get_sset_count, .get_ethtool_stats = xennet_get_ethtool_stats, .get_strings = xennet_get_strings, + .get_ts_info = ethtool_op_get_ts_info, }; #ifdef CONFIG_SYSFS @@ -2109,12 +2594,11 @@ static ssize_t store_rxbuf(struct device *dev, const char *buf, size_t len) { char *endp; - unsigned long target; if (!capable(CAP_NET_ADMIN)) return -EPERM; - target = simple_strtoul(buf, &endp, 0); + simple_strtoul(buf, &endp, 0); if (endp == buf) return -EBADMSG; @@ -2139,28 +2623,43 @@ static const struct attribute_group xennet_dev_group = { }; #endif /* CONFIG_SYSFS */ -static int xennet_remove(struct xenbus_device *dev) +static void xennet_bus_close(struct xenbus_device *dev) { - struct netfront_info *info = dev_get_drvdata(&dev->dev); - - dev_dbg(&dev->dev, "%s\n", dev->nodename); + int ret; - if (xenbus_read_driver_state(dev->otherend) != XenbusStateClosed) { + if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosed) + return; + do { xenbus_switch_state(dev, XenbusStateClosing); - wait_event(module_wq, - xenbus_read_driver_state(dev->otherend) == - XenbusStateClosing || - xenbus_read_driver_state(dev->otherend) == - XenbusStateUnknown); + ret = wait_event_timeout(module_wq, + xenbus_read_driver_state(dev->otherend) == + XenbusStateClosing || + xenbus_read_driver_state(dev->otherend) == + XenbusStateClosed || + xenbus_read_driver_state(dev->otherend) == + XenbusStateUnknown, + XENNET_TIMEOUT); + } while (!ret); + + if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosed) + return; + do { xenbus_switch_state(dev, XenbusStateClosed); - wait_event(module_wq, - xenbus_read_driver_state(dev->otherend) == - XenbusStateClosed || - xenbus_read_driver_state(dev->otherend) == - XenbusStateUnknown); - } + ret = wait_event_timeout(module_wq, + xenbus_read_driver_state(dev->otherend) == + XenbusStateClosed || + xenbus_read_driver_state(dev->otherend) == + XenbusStateUnknown, + XENNET_TIMEOUT); + } while (!ret); +} +static void xennet_remove(struct xenbus_device *dev) +{ + struct netfront_info *info = dev_get_drvdata(&dev->dev); + + xennet_bus_close(dev); xennet_disconnect_backend(info); if (info->netdev->reg_state == NETREG_REGISTERED) @@ -2172,8 +2671,6 @@ static int xennet_remove(struct xenbus_device *dev) rtnl_unlock(); } xennet_free_netdev(info->netdev); - - return 0; } static const struct xenbus_device_id netfront_ids[] = { @@ -2199,8 +2696,9 @@ static int __init netif_init(void) pr_info("Initialising Xen virtual ethernet driver\n"); - /* Allow as many queues as there are CPUs inut max. 8 if user has not - * specified a value. + /* Allow the number of queues to match the number of CPUs, but not exceed + * the maximum limit. If the user has not specified a value, the default + * maximum limit is 8. */ if (xennet_max_queues == 0) xennet_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT, |
