summaryrefslogtreecommitdiff
path: root/drivers/net/ethernet/intel/i40e/i40e_txrx.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/ethernet/intel/i40e/i40e_txrx.c')
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx.c2718
1 files changed, 1660 insertions, 1058 deletions
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index b936febc315a..cc0b9efc2637 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1,45 +1,16 @@
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
-#include <linux/prefetch.h>
-#include <net/busy_poll.h>
#include <linux/bpf_trace.h>
-#include "i40e.h"
+#include <linux/net/intel/libie/pctype.h>
+#include <linux/net/intel/libie/rx.h>
+#include <linux/prefetch.h>
+#include <linux/sctp.h>
+#include <net/mpls.h>
+#include <net/xdp.h>
+#include "i40e_txrx_common.h"
#include "i40e_trace.h"
-#include "i40e_prototype.h"
-
-static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
- u32 td_tag)
-{
- return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA |
- ((u64)td_cmd << I40E_TXD_QW1_CMD_SHIFT) |
- ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) |
- ((u64)size << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) |
- ((u64)td_tag << I40E_TXD_QW1_L2TAG1_SHIFT));
-}
+#include "i40e_xsk.h"
#define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
/**
@@ -54,7 +25,7 @@ static void i40e_fdir(struct i40e_ring *tx_ring,
{
struct i40e_filter_program_desc *fdir_desc;
struct i40e_pf *pf = tx_ring->vsi->back;
- u32 flex_ptype, dtype_cmd;
+ u32 flex_ptype, dtype_cmd, vsi_id;
u16 i;
/* grab the next descriptor */
@@ -64,22 +35,16 @@ static void i40e_fdir(struct i40e_ring *tx_ring,
i++;
tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
- flex_ptype = I40E_TXD_FLTR_QW0_QINDEX_MASK &
- (fdata->q_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT);
+ flex_ptype = FIELD_PREP(I40E_TXD_FLTR_QW0_QINDEX_MASK, fdata->q_index);
- flex_ptype |= I40E_TXD_FLTR_QW0_FLEXOFF_MASK &
- (fdata->flex_off << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT);
+ flex_ptype |= FIELD_PREP(I40E_TXD_FLTR_QW0_FLEXOFF_MASK,
+ fdata->flex_off);
- flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK &
- (fdata->pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
-
- flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK &
- (fdata->flex_offset << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT);
+ flex_ptype |= FIELD_PREP(I40E_TXD_FLTR_QW0_PCTYPE_MASK, fdata->pctype);
/* Use LAN VSI Id if not programmed by user */
- flex_ptype |= I40E_TXD_FLTR_QW0_DEST_VSI_MASK &
- ((u32)(fdata->dest_vsi ? : pf->vsi[pf->lan_vsi]->id) <<
- I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT);
+ vsi_id = fdata->dest_vsi ? : i40e_pf_get_main_vsi(pf)->id;
+ flex_ptype |= FIELD_PREP(I40E_TXD_FLTR_QW0_DEST_VSI_MASK, vsi_id);
dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
@@ -89,17 +54,15 @@ static void i40e_fdir(struct i40e_ring *tx_ring,
I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
I40E_TXD_FLTR_QW1_PCMD_SHIFT;
- dtype_cmd |= I40E_TXD_FLTR_QW1_DEST_MASK &
- (fdata->dest_ctl << I40E_TXD_FLTR_QW1_DEST_SHIFT);
+ dtype_cmd |= FIELD_PREP(I40E_TXD_FLTR_QW1_DEST_MASK, fdata->dest_ctl);
- dtype_cmd |= I40E_TXD_FLTR_QW1_FD_STATUS_MASK &
- (fdata->fd_status << I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT);
+ dtype_cmd |= FIELD_PREP(I40E_TXD_FLTR_QW1_FD_STATUS_MASK,
+ fdata->fd_status);
if (fdata->cnt_index) {
dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
- dtype_cmd |= I40E_TXD_FLTR_QW1_CNTINDEX_MASK &
- ((u32)fdata->cnt_index <<
- I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT);
+ dtype_cmd |= FIELD_PREP(I40E_TXD_FLTR_QW1_CNTINDEX_MASK,
+ fdata->cnt_index);
}
fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
@@ -191,59 +154,180 @@ dma_fail:
return -1;
}
-#define IP_HEADER_OFFSET 14
-#define I40E_UDPIP_DUMMY_PACKET_LEN 42
/**
- * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters
- * @vsi: pointer to the targeted VSI
- * @fd_data: the flow director data required for the FDir descriptor
- * @add: true adds a filter, false removes it
+ * i40e_create_dummy_packet - Constructs dummy packet for HW
+ * @dummy_packet: preallocated space for dummy packet
+ * @ipv4: is layer 3 packet of version 4 or 6
+ * @l4proto: next level protocol used in data portion of l3
+ * @data: filter data
*
- * Returns 0 if the filters were successfully added or removed
+ * Returns address of layer 4 protocol dummy packet.
**/
-static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi,
- struct i40e_fdir_filter *fd_data,
- bool add)
+static char *i40e_create_dummy_packet(u8 *dummy_packet, bool ipv4, u8 l4proto,
+ struct i40e_fdir_filter *data)
+{
+ bool is_vlan = !!data->vlan_tag;
+ struct vlan_hdr vlan = {};
+ struct ipv6hdr ipv6 = {};
+ struct ethhdr eth = {};
+ struct iphdr ip = {};
+ u8 *tmp;
+
+ if (ipv4) {
+ eth.h_proto = cpu_to_be16(ETH_P_IP);
+ ip.protocol = l4proto;
+ ip.version = 0x4;
+ ip.ihl = 0x5;
+
+ ip.daddr = data->dst_ip;
+ ip.saddr = data->src_ip;
+ } else {
+ eth.h_proto = cpu_to_be16(ETH_P_IPV6);
+ ipv6.nexthdr = l4proto;
+ ipv6.version = 0x6;
+
+ memcpy(&ipv6.saddr.in6_u.u6_addr32, data->src_ip6,
+ sizeof(__be32) * 4);
+ memcpy(&ipv6.daddr.in6_u.u6_addr32, data->dst_ip6,
+ sizeof(__be32) * 4);
+ }
+
+ if (is_vlan) {
+ vlan.h_vlan_TCI = data->vlan_tag;
+ vlan.h_vlan_encapsulated_proto = eth.h_proto;
+ eth.h_proto = data->vlan_etype;
+ }
+
+ tmp = dummy_packet;
+ memcpy(tmp, &eth, sizeof(eth));
+ tmp += sizeof(eth);
+
+ if (is_vlan) {
+ memcpy(tmp, &vlan, sizeof(vlan));
+ tmp += sizeof(vlan);
+ }
+
+ if (ipv4) {
+ memcpy(tmp, &ip, sizeof(ip));
+ tmp += sizeof(ip);
+ } else {
+ memcpy(tmp, &ipv6, sizeof(ipv6));
+ tmp += sizeof(ipv6);
+ }
+
+ return tmp;
+}
+
+/**
+ * i40e_create_dummy_udp_packet - helper function to create UDP packet
+ * @raw_packet: preallocated space for dummy packet
+ * @ipv4: is layer 3 packet of version 4 or 6
+ * @l4proto: next level protocol used in data portion of l3
+ * @data: filter data
+ *
+ * Helper function to populate udp fields.
+ **/
+static void i40e_create_dummy_udp_packet(u8 *raw_packet, bool ipv4, u8 l4proto,
+ struct i40e_fdir_filter *data)
{
- struct i40e_pf *pf = vsi->back;
struct udphdr *udp;
- struct iphdr *ip;
- u8 *raw_packet;
- int ret;
- static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
- 0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ u8 *tmp;
- raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
- if (!raw_packet)
- return -ENOMEM;
- memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN);
+ tmp = i40e_create_dummy_packet(raw_packet, ipv4, IPPROTO_UDP, data);
+ udp = (struct udphdr *)(tmp);
+ udp->dest = data->dst_port;
+ udp->source = data->src_port;
+}
+
+/**
+ * i40e_create_dummy_tcp_packet - helper function to create TCP packet
+ * @raw_packet: preallocated space for dummy packet
+ * @ipv4: is layer 3 packet of version 4 or 6
+ * @l4proto: next level protocol used in data portion of l3
+ * @data: filter data
+ *
+ * Helper function to populate tcp fields.
+ **/
+static void i40e_create_dummy_tcp_packet(u8 *raw_packet, bool ipv4, u8 l4proto,
+ struct i40e_fdir_filter *data)
+{
+ struct tcphdr *tcp;
+ u8 *tmp;
+ /* Dummy tcp packet */
+ static const char tcp_packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0x50, 0x11, 0x0, 0x72, 0, 0, 0, 0};
- ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
- udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET
- + sizeof(struct iphdr));
+ tmp = i40e_create_dummy_packet(raw_packet, ipv4, IPPROTO_TCP, data);
- ip->daddr = fd_data->dst_ip;
- udp->dest = fd_data->dst_port;
- ip->saddr = fd_data->src_ip;
- udp->source = fd_data->src_port;
+ tcp = (struct tcphdr *)tmp;
+ memcpy(tcp, tcp_packet, sizeof(tcp_packet));
+ tcp->dest = data->dst_port;
+ tcp->source = data->src_port;
+}
+
+/**
+ * i40e_create_dummy_sctp_packet - helper function to create SCTP packet
+ * @raw_packet: preallocated space for dummy packet
+ * @ipv4: is layer 3 packet of version 4 or 6
+ * @l4proto: next level protocol used in data portion of l3
+ * @data: filter data
+ *
+ * Helper function to populate sctp fields.
+ **/
+static void i40e_create_dummy_sctp_packet(u8 *raw_packet, bool ipv4,
+ u8 l4proto,
+ struct i40e_fdir_filter *data)
+{
+ struct sctphdr *sctp;
+ u8 *tmp;
+
+ tmp = i40e_create_dummy_packet(raw_packet, ipv4, IPPROTO_SCTP, data);
+
+ sctp = (struct sctphdr *)tmp;
+ sctp->dest = data->dst_port;
+ sctp->source = data->src_port;
+}
+
+/**
+ * i40e_prepare_fdir_filter - Prepare and program fdir filter
+ * @pf: physical function to attach filter to
+ * @fd_data: filter data
+ * @add: add or delete filter
+ * @packet_addr: address of dummy packet, used in filtering
+ * @payload_offset: offset from dummy packet address to user defined data
+ * @pctype: Packet type for which filter is used
+ *
+ * Helper function to offset data of dummy packet, program it and
+ * handle errors.
+ **/
+static int i40e_prepare_fdir_filter(struct i40e_pf *pf,
+ struct i40e_fdir_filter *fd_data,
+ bool add, char *packet_addr,
+ int payload_offset, u8 pctype)
+{
+ int ret;
if (fd_data->flex_filter) {
- u8 *payload = raw_packet + I40E_UDPIP_DUMMY_PACKET_LEN;
+ u8 *payload;
__be16 pattern = fd_data->flex_word;
u16 off = fd_data->flex_offset;
+ payload = packet_addr + payload_offset;
+
+ /* If user provided vlan, offset payload by vlan header length */
+ if (!!fd_data->vlan_tag)
+ payload += VLAN_HLEN;
+
*((__force __be16 *)(payload + off)) = pattern;
}
- fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
- ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
+ fd_data->pctype = pctype;
+ ret = i40e_program_fdir_filter(fd_data, packet_addr, pf, add);
if (ret) {
dev_info(&pf->pdev->dev,
"PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
fd_data->pctype, fd_data->fd_id, ret);
/* Free the packet buffer since it wasn't added to the ring */
- kfree(raw_packet);
return -EOPNOTSUPP;
} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
if (add)
@@ -256,281 +340,316 @@ static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi,
fd_data->pctype, fd_data->fd_id);
}
- if (add)
- pf->fd_udp4_filter_cnt++;
- else
- pf->fd_udp4_filter_cnt--;
+ return ret;
+}
- return 0;
+/**
+ * i40e_change_filter_num - Prepare and program fdir filter
+ * @ipv4: is layer 3 packet of version 4 or 6
+ * @add: add or delete filter
+ * @ipv4_filter_num: field to update
+ * @ipv6_filter_num: field to update
+ *
+ * Update filter number field for pf.
+ **/
+static void i40e_change_filter_num(bool ipv4, bool add, u16 *ipv4_filter_num,
+ u16 *ipv6_filter_num)
+{
+ if (add) {
+ if (ipv4)
+ (*ipv4_filter_num)++;
+ else
+ (*ipv6_filter_num)++;
+ } else {
+ if (ipv4)
+ (*ipv4_filter_num)--;
+ else
+ (*ipv6_filter_num)--;
+ }
}
-#define I40E_TCPIP_DUMMY_PACKET_LEN 54
+#define I40E_UDPIP_DUMMY_PACKET_LEN 42
+#define I40E_UDPIP6_DUMMY_PACKET_LEN 62
/**
- * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters
+ * i40e_add_del_fdir_udp - Add/Remove UDP filters
* @vsi: pointer to the targeted VSI
* @fd_data: the flow director data required for the FDir descriptor
* @add: true adds a filter, false removes it
+ * @ipv4: true is v4, false is v6
*
* Returns 0 if the filters were successfully added or removed
**/
-static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi,
- struct i40e_fdir_filter *fd_data,
- bool add)
+static int i40e_add_del_fdir_udp(struct i40e_vsi *vsi,
+ struct i40e_fdir_filter *fd_data,
+ bool add,
+ bool ipv4)
{
struct i40e_pf *pf = vsi->back;
- struct tcphdr *tcp;
- struct iphdr *ip;
u8 *raw_packet;
int ret;
- /* Dummy packet */
- static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
- 0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11,
- 0x0, 0x72, 0, 0, 0, 0};
raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
if (!raw_packet)
return -ENOMEM;
- memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN);
-
- ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
- tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET
- + sizeof(struct iphdr));
- ip->daddr = fd_data->dst_ip;
- tcp->dest = fd_data->dst_port;
- ip->saddr = fd_data->src_ip;
- tcp->source = fd_data->src_port;
+ i40e_create_dummy_udp_packet(raw_packet, ipv4, IPPROTO_UDP, fd_data);
- if (fd_data->flex_filter) {
- u8 *payload = raw_packet + I40E_TCPIP_DUMMY_PACKET_LEN;
- __be16 pattern = fd_data->flex_word;
- u16 off = fd_data->flex_offset;
+ if (ipv4)
+ ret = i40e_prepare_fdir_filter
+ (pf, fd_data, add, raw_packet,
+ I40E_UDPIP_DUMMY_PACKET_LEN,
+ LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP);
+ else
+ ret = i40e_prepare_fdir_filter
+ (pf, fd_data, add, raw_packet,
+ I40E_UDPIP6_DUMMY_PACKET_LEN,
+ LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP);
- *((__force __be16 *)(payload + off)) = pattern;
+ if (ret) {
+ kfree(raw_packet);
+ return ret;
}
- fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
- ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
+ i40e_change_filter_num(ipv4, add, &pf->fd_udp4_filter_cnt,
+ &pf->fd_udp6_filter_cnt);
+
+ return 0;
+}
+
+#define I40E_TCPIP_DUMMY_PACKET_LEN 54
+#define I40E_TCPIP6_DUMMY_PACKET_LEN 74
+/**
+ * i40e_add_del_fdir_tcp - Add/Remove TCPv4 filters
+ * @vsi: pointer to the targeted VSI
+ * @fd_data: the flow director data required for the FDir descriptor
+ * @add: true adds a filter, false removes it
+ * @ipv4: true is v4, false is v6
+ *
+ * Returns 0 if the filters were successfully added or removed
+ **/
+static int i40e_add_del_fdir_tcp(struct i40e_vsi *vsi,
+ struct i40e_fdir_filter *fd_data,
+ bool add,
+ bool ipv4)
+{
+ struct i40e_pf *pf = vsi->back;
+ u8 *raw_packet;
+ int ret;
+
+ raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
+ if (!raw_packet)
+ return -ENOMEM;
+
+ i40e_create_dummy_tcp_packet(raw_packet, ipv4, IPPROTO_TCP, fd_data);
+ if (ipv4)
+ ret = i40e_prepare_fdir_filter
+ (pf, fd_data, add, raw_packet,
+ I40E_TCPIP_DUMMY_PACKET_LEN,
+ LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP);
+ else
+ ret = i40e_prepare_fdir_filter
+ (pf, fd_data, add, raw_packet,
+ I40E_TCPIP6_DUMMY_PACKET_LEN,
+ LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP);
+
if (ret) {
- dev_info(&pf->pdev->dev,
- "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
- fd_data->pctype, fd_data->fd_id, ret);
- /* Free the packet buffer since it wasn't added to the ring */
kfree(raw_packet);
- return -EOPNOTSUPP;
- } else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
- if (add)
- dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d loc = %d)\n",
- fd_data->pctype, fd_data->fd_id);
- else
- dev_info(&pf->pdev->dev,
- "Filter deleted for PCTYPE %d loc = %d\n",
- fd_data->pctype, fd_data->fd_id);
+ return ret;
}
+ i40e_change_filter_num(ipv4, add, &pf->fd_tcp4_filter_cnt,
+ &pf->fd_tcp6_filter_cnt);
+
if (add) {
- pf->fd_tcp4_filter_cnt++;
- if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
+ if (test_bit(I40E_FLAG_FD_ATR_ENA, pf->flags) &&
I40E_DEBUG_FD & pf->hw.debug_mask)
dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n");
- pf->flags |= I40E_FLAG_FD_ATR_AUTO_DISABLED;
- } else {
- pf->fd_tcp4_filter_cnt--;
+ set_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state);
}
-
return 0;
}
-#define I40E_SCTPIP_DUMMY_PACKET_LEN 46
+#define I40E_SCTPIP_DUMMY_PACKET_LEN 46
+#define I40E_SCTPIP6_DUMMY_PACKET_LEN 66
/**
- * i40e_add_del_fdir_sctpv4 - Add/Remove SCTPv4 Flow Director filters for
+ * i40e_add_del_fdir_sctp - Add/Remove SCTPv4 Flow Director filters for
* a specific flow spec
* @vsi: pointer to the targeted VSI
* @fd_data: the flow director data required for the FDir descriptor
* @add: true adds a filter, false removes it
+ * @ipv4: true is v4, false is v6
*
* Returns 0 if the filters were successfully added or removed
**/
-static int i40e_add_del_fdir_sctpv4(struct i40e_vsi *vsi,
- struct i40e_fdir_filter *fd_data,
- bool add)
+static int i40e_add_del_fdir_sctp(struct i40e_vsi *vsi,
+ struct i40e_fdir_filter *fd_data,
+ bool add,
+ bool ipv4)
{
struct i40e_pf *pf = vsi->back;
- struct sctphdr *sctp;
- struct iphdr *ip;
u8 *raw_packet;
int ret;
- /* Dummy packet */
- static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
- 0x45, 0, 0, 0x20, 0, 0, 0x40, 0, 0x40, 0x84, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
if (!raw_packet)
return -ENOMEM;
- memcpy(raw_packet, packet, I40E_SCTPIP_DUMMY_PACKET_LEN);
- ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
- sctp = (struct sctphdr *)(raw_packet + IP_HEADER_OFFSET
- + sizeof(struct iphdr));
+ i40e_create_dummy_sctp_packet(raw_packet, ipv4, IPPROTO_SCTP, fd_data);
- ip->daddr = fd_data->dst_ip;
- sctp->dest = fd_data->dst_port;
- ip->saddr = fd_data->src_ip;
- sctp->source = fd_data->src_port;
-
- if (fd_data->flex_filter) {
- u8 *payload = raw_packet + I40E_SCTPIP_DUMMY_PACKET_LEN;
- __be16 pattern = fd_data->flex_word;
- u16 off = fd_data->flex_offset;
-
- *((__force __be16 *)(payload + off)) = pattern;
- }
+ if (ipv4)
+ ret = i40e_prepare_fdir_filter
+ (pf, fd_data, add, raw_packet,
+ I40E_SCTPIP_DUMMY_PACKET_LEN,
+ LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP);
+ else
+ ret = i40e_prepare_fdir_filter
+ (pf, fd_data, add, raw_packet,
+ I40E_SCTPIP6_DUMMY_PACKET_LEN,
+ LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP);
- fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_SCTP;
- ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
if (ret) {
- dev_info(&pf->pdev->dev,
- "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
- fd_data->pctype, fd_data->fd_id, ret);
- /* Free the packet buffer since it wasn't added to the ring */
kfree(raw_packet);
- return -EOPNOTSUPP;
- } else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
- if (add)
- dev_info(&pf->pdev->dev,
- "Filter OK for PCTYPE %d loc = %d\n",
- fd_data->pctype, fd_data->fd_id);
- else
- dev_info(&pf->pdev->dev,
- "Filter deleted for PCTYPE %d loc = %d\n",
- fd_data->pctype, fd_data->fd_id);
+ return ret;
}
- if (add)
- pf->fd_sctp4_filter_cnt++;
- else
- pf->fd_sctp4_filter_cnt--;
+ i40e_change_filter_num(ipv4, add, &pf->fd_sctp4_filter_cnt,
+ &pf->fd_sctp6_filter_cnt);
return 0;
}
-#define I40E_IP_DUMMY_PACKET_LEN 34
+#define I40E_IP_DUMMY_PACKET_LEN 34
+#define I40E_IP6_DUMMY_PACKET_LEN 54
/**
- * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for
+ * i40e_add_del_fdir_ip - Add/Remove IPv4 Flow Director filters for
* a specific flow spec
* @vsi: pointer to the targeted VSI
* @fd_data: the flow director data required for the FDir descriptor
* @add: true adds a filter, false removes it
+ * @ipv4: true is v4, false is v6
*
* Returns 0 if the filters were successfully added or removed
**/
-static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi,
- struct i40e_fdir_filter *fd_data,
- bool add)
+static int i40e_add_del_fdir_ip(struct i40e_vsi *vsi,
+ struct i40e_fdir_filter *fd_data,
+ bool add,
+ bool ipv4)
{
struct i40e_pf *pf = vsi->back;
- struct iphdr *ip;
+ int payload_offset;
u8 *raw_packet;
+ int iter_start;
+ int iter_end;
int ret;
int i;
- static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
- 0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0};
- for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
- i <= I40E_FILTER_PCTYPE_FRAG_IPV4; i++) {
+ if (ipv4) {
+ iter_start = LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER;
+ iter_end = LIBIE_FILTER_PCTYPE_FRAG_IPV4;
+ } else {
+ iter_start = LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER;
+ iter_end = LIBIE_FILTER_PCTYPE_FRAG_IPV6;
+ }
+
+ for (i = iter_start; i <= iter_end; i++) {
raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
if (!raw_packet)
return -ENOMEM;
- memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN);
- ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
-
- ip->saddr = fd_data->src_ip;
- ip->daddr = fd_data->dst_ip;
- ip->protocol = 0;
- if (fd_data->flex_filter) {
- u8 *payload = raw_packet + I40E_IP_DUMMY_PACKET_LEN;
- __be16 pattern = fd_data->flex_word;
- u16 off = fd_data->flex_offset;
+ /* IPv6 no header option differs from IPv4 */
+ (void)i40e_create_dummy_packet
+ (raw_packet, ipv4, (ipv4) ? IPPROTO_IP : IPPROTO_NONE,
+ fd_data);
- *((__force __be16 *)(payload + off)) = pattern;
- }
-
- fd_data->pctype = i;
- ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
- if (ret) {
- dev_info(&pf->pdev->dev,
- "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
- fd_data->pctype, fd_data->fd_id, ret);
- /* The packet buffer wasn't added to the ring so we
- * need to free it now.
- */
- kfree(raw_packet);
- return -EOPNOTSUPP;
- } else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
- if (add)
- dev_info(&pf->pdev->dev,
- "Filter OK for PCTYPE %d loc = %d\n",
- fd_data->pctype, fd_data->fd_id);
- else
- dev_info(&pf->pdev->dev,
- "Filter deleted for PCTYPE %d loc = %d\n",
- fd_data->pctype, fd_data->fd_id);
- }
+ payload_offset = (ipv4) ? I40E_IP_DUMMY_PACKET_LEN :
+ I40E_IP6_DUMMY_PACKET_LEN;
+ ret = i40e_prepare_fdir_filter(pf, fd_data, add, raw_packet,
+ payload_offset, i);
+ if (ret)
+ goto err;
}
- if (add)
- pf->fd_ip4_filter_cnt++;
- else
- pf->fd_ip4_filter_cnt--;
+ i40e_change_filter_num(ipv4, add, &pf->fd_ip4_filter_cnt,
+ &pf->fd_ip6_filter_cnt);
return 0;
+err:
+ kfree(raw_packet);
+ return ret;
}
/**
* i40e_add_del_fdir - Build raw packets to add/del fdir filter
* @vsi: pointer to the targeted VSI
- * @cmd: command to get or set RX flow classification rules
+ * @input: filter to add or delete
* @add: true adds a filter, false removes it
*
**/
int i40e_add_del_fdir(struct i40e_vsi *vsi,
struct i40e_fdir_filter *input, bool add)
{
+ enum ip_ver { ipv6 = 0, ipv4 = 1 };
struct i40e_pf *pf = vsi->back;
int ret;
switch (input->flow_type & ~FLOW_EXT) {
case TCP_V4_FLOW:
- ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
+ ret = i40e_add_del_fdir_tcp(vsi, input, add, ipv4);
break;
case UDP_V4_FLOW:
- ret = i40e_add_del_fdir_udpv4(vsi, input, add);
+ ret = i40e_add_del_fdir_udp(vsi, input, add, ipv4);
break;
case SCTP_V4_FLOW:
- ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
+ ret = i40e_add_del_fdir_sctp(vsi, input, add, ipv4);
+ break;
+ case TCP_V6_FLOW:
+ ret = i40e_add_del_fdir_tcp(vsi, input, add, ipv6);
+ break;
+ case UDP_V6_FLOW:
+ ret = i40e_add_del_fdir_udp(vsi, input, add, ipv6);
+ break;
+ case SCTP_V6_FLOW:
+ ret = i40e_add_del_fdir_sctp(vsi, input, add, ipv6);
break;
case IP_USER_FLOW:
- switch (input->ip4_proto) {
+ switch (input->ipl4_proto) {
case IPPROTO_TCP:
- ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
+ ret = i40e_add_del_fdir_tcp(vsi, input, add, ipv4);
break;
case IPPROTO_UDP:
- ret = i40e_add_del_fdir_udpv4(vsi, input, add);
+ ret = i40e_add_del_fdir_udp(vsi, input, add, ipv4);
break;
case IPPROTO_SCTP:
- ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
+ ret = i40e_add_del_fdir_sctp(vsi, input, add, ipv4);
break;
case IPPROTO_IP:
- ret = i40e_add_del_fdir_ipv4(vsi, input, add);
+ ret = i40e_add_del_fdir_ip(vsi, input, add, ipv4);
break;
default:
/* We cannot support masking based on protocol */
dev_info(&pf->pdev->dev, "Unsupported IPv4 protocol 0x%02x\n",
- input->ip4_proto);
+ input->ipl4_proto);
+ return -EINVAL;
+ }
+ break;
+ case IPV6_USER_FLOW:
+ switch (input->ipl4_proto) {
+ case IPPROTO_TCP:
+ ret = i40e_add_del_fdir_tcp(vsi, input, add, ipv6);
+ break;
+ case IPPROTO_UDP:
+ ret = i40e_add_del_fdir_udp(vsi, input, add, ipv6);
+ break;
+ case IPPROTO_SCTP:
+ ret = i40e_add_del_fdir_sctp(vsi, input, add, ipv6);
+ break;
+ case IPPROTO_IP:
+ ret = i40e_add_del_fdir_ip(vsi, input, add, ipv6);
+ break;
+ default:
+ /* We cannot support masking based on protocol */
+ dev_info(&pf->pdev->dev, "Unsupported IPv6 protocol 0x%02x\n",
+ input->ipl4_proto);
return -EINVAL;
}
break;
@@ -552,28 +671,28 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
/**
* i40e_fd_handle_status - check the Programming Status for FD
* @rx_ring: the Rx ring for this descriptor
- * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
+ * @qword0_raw: qword0
+ * @qword1: qword1 after le_to_cpu
* @prog_id: the id originally used for programming
*
* This is used to verify if the FD programming or invalidation
* requested by SW to the HW is successful or not and take actions accordingly.
**/
-static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc, u8 prog_id)
+static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1, u8 prog_id)
{
struct i40e_pf *pf = rx_ring->vsi->back;
struct pci_dev *pdev = pf->pdev;
+ struct i40e_16b_rx_wb_qw0 *qw0;
u32 fcnt_prog, fcnt_avail;
u32 error;
- u64 qw;
- qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
- error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
- I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
+ qw0 = (struct i40e_16b_rx_wb_qw0 *)&qword0_raw;
+ error = FIELD_GET(I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK, qword1);
if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
- pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
- if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
+ pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id);
+ if (qw0->hi_dword.fd_id != 0 ||
(I40E_DEBUG_FD & pf->hw.debug_mask))
dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
pf->fd_inv);
@@ -591,9 +710,15 @@ static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
/* store the current atr filter count */
pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
- if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
- pf->flags & I40E_FLAG_FD_SB_AUTO_DISABLED) {
- pf->flags |= I40E_FLAG_FD_ATR_AUTO_DISABLED;
+ if (qw0->hi_dword.fd_id == 0 &&
+ test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) {
+ /* These set_bit() calls aren't atomic with the
+ * test_bit() here, but worse case we potentially
+ * disable ATR and queue a flush right after SB
+ * support is re-enabled. That shouldn't cause an
+ * issue in practice
+ */
+ set_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state);
set_bit(__I40E_FD_FLUSH_REQUESTED, pf->state);
}
@@ -605,17 +730,16 @@ static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
* FD ATR/SB and then re-enable it when there is room.
*/
if (fcnt_prog >= (fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN)) {
- if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) &&
- !(pf->flags & I40E_FLAG_FD_SB_AUTO_DISABLED)) {
- pf->flags |= I40E_FLAG_FD_SB_AUTO_DISABLED;
+ if (test_bit(I40E_FLAG_FD_SB_ENA, pf->flags) &&
+ !test_and_set_bit(__I40E_FD_SB_AUTO_DISABLED,
+ pf->state))
if (I40E_DEBUG_FD & pf->hw.debug_mask)
dev_warn(&pdev->dev, "FD filter space full, new ntuple rules will not be added\n");
- }
}
} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
if (I40E_DEBUG_FD & pf->hw.debug_mask)
dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
- rx_desc->wb.qword0.hi_dword.fd_id);
+ qw0->hi_dword.fd_id);
}
}
@@ -631,7 +755,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
kfree(tx_buffer->raw_buf);
else if (ring_is_xdp(ring))
- page_frag_free(tx_buffer->raw_buf);
+ xdp_return_frame(tx_buffer->xdpf);
else
dev_kfree_skb_any(tx_buffer->skb);
if (dma_unmap_len(tx_buffer, len))
@@ -661,13 +785,18 @@ void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
unsigned long bi_size;
u16 i;
- /* ring already cleared, nothing to do */
- if (!tx_ring->tx_bi)
- return;
+ if (ring_is_xdp(tx_ring) && tx_ring->xsk_pool) {
+ i40e_xsk_clean_tx_ring(tx_ring);
+ } else {
+ /* ring already cleared, nothing to do */
+ if (!tx_ring->tx_bi)
+ return;
- /* Free all the Tx ring sk_buffs */
- for (i = 0; i < tx_ring->count; i++)
- i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
+ /* Free all the Tx ring sk_buffs */
+ for (i = 0; i < tx_ring->count; i++)
+ i40e_unmap_and_free_tx_resource(tx_ring,
+ &tx_ring->tx_bi[i]);
+ }
bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
memset(tx_ring->tx_bi, 0, bi_size);
@@ -706,17 +835,23 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
/**
* i40e_get_tx_pending - how many tx descriptors not processed
- * @tx_ring: the ring of descriptors
+ * @ring: the ring of descriptors
+ * @in_sw: use SW variables
*
* Since there is no access to the ring head register
* in XL710, we need to use our local copies
**/
-u32 i40e_get_tx_pending(struct i40e_ring *ring)
+u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw)
{
u32 head, tail;
- head = i40e_get_head(ring);
- tail = readl(ring->tail);
+ if (!in_sw) {
+ head = i40e_get_head(ring);
+ tail = readl(ring->tail);
+ } else {
+ head = ring->next_to_clean;
+ tail = ring->next_to_use;
+ }
if (head != tail)
return (head < tail) ?
@@ -725,20 +860,75 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring)
return 0;
}
-#define WB_STRIDE 4
+/**
+ * i40e_detect_recover_hung - Function to detect and recover hung_queues
+ * @pf: pointer to PF struct
+ *
+ * LAN VSI has netdev and netdev has TX queues. This function is to check
+ * each of those TX queues if they are hung, trigger recovery by issuing
+ * SW interrupt.
+ **/
+void i40e_detect_recover_hung(struct i40e_pf *pf)
+{
+ struct i40e_vsi *vsi = i40e_pf_get_main_vsi(pf);
+ struct i40e_ring *tx_ring = NULL;
+ struct net_device *netdev;
+ unsigned int i;
+ int packets;
+
+ if (!vsi)
+ return;
+
+ if (test_bit(__I40E_VSI_DOWN, vsi->state))
+ return;
+
+ netdev = vsi->netdev;
+ if (!netdev)
+ return;
+
+ if (!netif_carrier_ok(netdev))
+ return;
+
+ for (i = 0; i < vsi->num_queue_pairs; i++) {
+ tx_ring = vsi->tx_rings[i];
+ if (tx_ring && tx_ring->desc) {
+ /* If packet counter has not changed the queue is
+ * likely stalled, so force an interrupt for this
+ * queue.
+ *
+ * prev_pkt_ctr would be negative if there was no
+ * pending work.
+ */
+ packets = tx_ring->stats.packets & INT_MAX;
+ if (tx_ring->tx_stats.prev_pkt_ctr == packets) {
+ i40e_force_wb(vsi, tx_ring->q_vector);
+ continue;
+ }
+
+ /* Memory barrier between read of packet count and call
+ * to i40e_get_tx_pending()
+ */
+ smp_rmb();
+ tx_ring->tx_stats.prev_pkt_ctr =
+ i40e_get_tx_pending(tx_ring, true) ? packets : -1;
+ }
+ }
+}
/**
* i40e_clean_tx_irq - Reclaim resources after transmit completes
* @vsi: the VSI we care about
* @tx_ring: Tx ring to clean
* @napi_budget: Used to determine if we are in netpoll
+ * @tx_cleaned: Out parameter set to the number of TXes cleaned
*
* Returns true if there's any budget left (e.g. the clean is finished)
**/
static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
- struct i40e_ring *tx_ring, int napi_budget)
+ struct i40e_ring *tx_ring, int napi_budget,
+ unsigned int *tx_cleaned)
{
- u16 i = tx_ring->next_to_clean;
+ int i = tx_ring->next_to_clean;
struct i40e_tx_buffer *tx_buf;
struct i40e_tx_desc *tx_head;
struct i40e_tx_desc *tx_desc;
@@ -758,9 +948,6 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
if (!eop_desc)
break;
- /* prevent any other reads prior to eop_desc */
- read_barrier_depends();
-
i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
/* we have caught up to head, no work left to do */
if (tx_head == tx_desc)
@@ -775,7 +962,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
/* free the skb/XDP data */
if (ring_is_xdp(tx_ring))
- page_frag_free(tx_buf->raw_buf);
+ xdp_return_frame(tx_buf->xdpf);
else
napi_consume_skb(tx_buf->skb, napi_budget);
@@ -831,27 +1018,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
i += tx_ring->count;
tx_ring->next_to_clean = i;
- u64_stats_update_begin(&tx_ring->syncp);
- tx_ring->stats.bytes += total_bytes;
- tx_ring->stats.packets += total_packets;
- u64_stats_update_end(&tx_ring->syncp);
- tx_ring->q_vector->tx.total_bytes += total_bytes;
- tx_ring->q_vector->tx.total_packets += total_packets;
-
- if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
- /* check to see if there are < 4 descriptors
- * waiting to be written back, then kick the hardware to force
- * them to be written back in case we stay in NAPI.
- * In this mode on X722 we do not enable Interrupt.
- */
- unsigned int j = i40e_get_tx_pending(tx_ring);
-
- if (budget &&
- ((j / WB_STRIDE) == 0) && (j > 0) &&
- !test_bit(__I40E_VSI_DOWN, vsi->state) &&
- (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
- tx_ring->arm_wb = true;
- }
+ i40e_update_tx_stats(tx_ring, total_packets, total_bytes);
+ i40e_arm_wb(tx_ring, vsi, budget);
if (ring_is_xdp(tx_ring))
return !!budget;
@@ -860,7 +1028,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
netdev_tx_completed_queue(txring_txq(tx_ring),
total_packets, total_bytes);
-#define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
+#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
(I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
/* Make sure that anybody stopping the queue after this
@@ -876,6 +1044,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
}
}
+ *tx_cleaned = total_packets;
return !!budget;
}
@@ -897,12 +1066,12 @@ static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
if (q_vector->arm_wb_state)
return;
- if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
+ if (test_bit(I40E_FLAG_MSIX_ENA, vsi->back->flags)) {
val = I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK |
I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */
wr32(&vsi->back->hw,
- I40E_PFINT_DYN_CTLN(q_vector->v_idx + vsi->base_vector - 1),
+ I40E_PFINT_DYN_CTLN(q_vector->reg_idx),
val);
} else {
val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK |
@@ -921,7 +1090,7 @@ static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
**/
void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
{
- if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
+ if (test_bit(I40E_FLAG_MSIX_ENA, vsi->back->flags)) {
u32 val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */
I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK |
@@ -929,8 +1098,7 @@ void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
/* allow 00 to be written to the index */
wr32(&vsi->back->hw,
- I40E_PFINT_DYN_CTLN(q_vector->v_idx +
- vsi->base_vector - 1), val);
+ I40E_PFINT_DYN_CTLN(q_vector->reg_idx), val);
} else {
u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK |
I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */
@@ -942,154 +1110,298 @@ void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
}
}
+static inline bool i40e_container_is_rx(struct i40e_q_vector *q_vector,
+ struct i40e_ring_container *rc)
+{
+ return &q_vector->rx == rc;
+}
+
+static inline unsigned int i40e_itr_divisor(struct i40e_q_vector *q_vector)
+{
+ unsigned int divisor;
+
+ switch (q_vector->vsi->back->hw.phy.link_info.link_speed) {
+ case I40E_LINK_SPEED_40GB:
+ divisor = I40E_ITR_ADAPTIVE_MIN_INC * 1024;
+ break;
+ case I40E_LINK_SPEED_25GB:
+ case I40E_LINK_SPEED_20GB:
+ divisor = I40E_ITR_ADAPTIVE_MIN_INC * 512;
+ break;
+ default:
+ case I40E_LINK_SPEED_10GB:
+ divisor = I40E_ITR_ADAPTIVE_MIN_INC * 256;
+ break;
+ case I40E_LINK_SPEED_1GB:
+ case I40E_LINK_SPEED_100MB:
+ divisor = I40E_ITR_ADAPTIVE_MIN_INC * 32;
+ break;
+ }
+
+ return divisor;
+}
+
/**
- * i40e_set_new_dynamic_itr - Find new ITR level
+ * i40e_update_itr - update the dynamic ITR value based on statistics
+ * @q_vector: structure containing interrupt and ring information
* @rc: structure containing ring performance data
*
- * Returns true if ITR changed, false if not
- *
- * Stores a new ITR value based on packets and byte counts during
- * the last interrupt. The advantage of per interrupt computation
- * is faster updates and more accurate ITR for the current traffic
- * pattern. Constants in this function were computed based on
- * theoretical maximum wire speed and thresholds were set based on
- * testing data as well as attempting to minimize response time
+ * Stores a new ITR value based on packets and byte
+ * counts during the last interrupt. The advantage of per interrupt
+ * computation is faster updates and more accurate ITR for the current
+ * traffic pattern. Constants in this function were computed
+ * based on theoretical maximum wire speed and thresholds were set based
+ * on testing data as well as attempting to minimize response time
* while increasing bulk throughput.
**/
-static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
+static void i40e_update_itr(struct i40e_q_vector *q_vector,
+ struct i40e_ring_container *rc)
{
- enum i40e_latency_range new_latency_range = rc->latency_range;
- struct i40e_q_vector *qv = rc->ring->q_vector;
- u32 new_itr = rc->itr;
- int bytes_per_int;
- int usecs;
+ unsigned int avg_wire_size, packets, bytes, itr;
+ unsigned long next_update = jiffies;
- if (rc->total_packets == 0 || !rc->itr)
- return false;
+ /* If we don't have any rings just leave ourselves set for maximum
+ * possible latency so we take ourselves out of the equation.
+ */
+ if (!rc->ring || !ITR_IS_DYNAMIC(rc->ring->itr_setting))
+ return;
- /* simple throttlerate management
- * 0-10MB/s lowest (50000 ints/s)
- * 10-20MB/s low (20000 ints/s)
- * 20-1249MB/s bulk (18000 ints/s)
- * > 40000 Rx packets per second (8000 ints/s)
+ /* For Rx we want to push the delay up and default to low latency.
+ * for Tx we want to pull the delay down and default to high latency.
+ */
+ itr = i40e_container_is_rx(q_vector, rc) ?
+ I40E_ITR_ADAPTIVE_MIN_USECS | I40E_ITR_ADAPTIVE_LATENCY :
+ I40E_ITR_ADAPTIVE_MAX_USECS | I40E_ITR_ADAPTIVE_LATENCY;
+
+ /* If we didn't update within up to 1 - 2 jiffies we can assume
+ * that either packets are coming in so slow there hasn't been
+ * any work, or that there is so much work that NAPI is dealing
+ * with interrupt moderation and we don't need to do anything.
+ */
+ if (time_after(next_update, rc->next_update))
+ goto clear_counts;
+
+ /* If itr_countdown is set it means we programmed an ITR within
+ * the last 4 interrupt cycles. This has a side effect of us
+ * potentially firing an early interrupt. In order to work around
+ * this we need to throw out any data received for a few
+ * interrupts following the update.
+ */
+ if (q_vector->itr_countdown) {
+ itr = rc->target_itr;
+ goto clear_counts;
+ }
+
+ packets = rc->total_packets;
+ bytes = rc->total_bytes;
+
+ if (i40e_container_is_rx(q_vector, rc)) {
+ /* If Rx there are 1 to 4 packets and bytes are less than
+ * 9000 assume insufficient data to use bulk rate limiting
+ * approach unless Tx is already in bulk rate limiting. We
+ * are likely latency driven.
+ */
+ if (packets && packets < 4 && bytes < 9000 &&
+ (q_vector->tx.target_itr & I40E_ITR_ADAPTIVE_LATENCY)) {
+ itr = I40E_ITR_ADAPTIVE_LATENCY;
+ goto adjust_by_size;
+ }
+ } else if (packets < 4) {
+ /* If we have Tx and Rx ITR maxed and Tx ITR is running in
+ * bulk mode and we are receiving 4 or fewer packets just
+ * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so
+ * that the Rx can relax.
+ */
+ if (rc->target_itr == I40E_ITR_ADAPTIVE_MAX_USECS &&
+ (q_vector->rx.target_itr & I40E_ITR_MASK) ==
+ I40E_ITR_ADAPTIVE_MAX_USECS)
+ goto clear_counts;
+ } else if (packets > 32) {
+ /* If we have processed over 32 packets in a single interrupt
+ * for Tx assume we need to switch over to "bulk" mode.
+ */
+ rc->target_itr &= ~I40E_ITR_ADAPTIVE_LATENCY;
+ }
+
+ /* We have no packets to actually measure against. This means
+ * either one of the other queues on this vector is active or
+ * we are a Tx queue doing TSO with too high of an interrupt rate.
*
- * The math works out because the divisor is in 10^(-6) which
- * turns the bytes/us input value into MB/s values, but
- * make sure to use usecs, as the register values written
- * are in 2 usec increments in the ITR registers, and make sure
- * to use the smoothed values that the countdown timer gives us.
+ * Between 4 and 56 we can assume that our current interrupt delay
+ * is only slightly too low. As such we should increase it by a small
+ * fixed amount.
*/
- usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
- bytes_per_int = rc->total_bytes / usecs;
+ if (packets < 56) {
+ itr = rc->target_itr + I40E_ITR_ADAPTIVE_MIN_INC;
+ if ((itr & I40E_ITR_MASK) > I40E_ITR_ADAPTIVE_MAX_USECS) {
+ itr &= I40E_ITR_ADAPTIVE_LATENCY;
+ itr += I40E_ITR_ADAPTIVE_MAX_USECS;
+ }
+ goto clear_counts;
+ }
- switch (new_latency_range) {
- case I40E_LOWEST_LATENCY:
- if (bytes_per_int > 10)
- new_latency_range = I40E_LOW_LATENCY;
- break;
- case I40E_LOW_LATENCY:
- if (bytes_per_int > 20)
- new_latency_range = I40E_BULK_LATENCY;
- else if (bytes_per_int <= 10)
- new_latency_range = I40E_LOWEST_LATENCY;
- break;
- case I40E_BULK_LATENCY:
- case I40E_ULTRA_LATENCY:
- default:
- if (bytes_per_int <= 20)
- new_latency_range = I40E_LOW_LATENCY;
- break;
+ if (packets <= 256) {
+ itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr);
+ itr &= I40E_ITR_MASK;
+
+ /* Between 56 and 112 is our "goldilocks" zone where we are
+ * working out "just right". Just report that our current
+ * ITR is good for us.
+ */
+ if (packets <= 112)
+ goto clear_counts;
+
+ /* If packet count is 128 or greater we are likely looking
+ * at a slight overrun of the delay we want. Try halving
+ * our delay to see if that will cut the number of packets
+ * in half per interrupt.
+ */
+ itr /= 2;
+ itr &= I40E_ITR_MASK;
+ if (itr < I40E_ITR_ADAPTIVE_MIN_USECS)
+ itr = I40E_ITR_ADAPTIVE_MIN_USECS;
+
+ goto clear_counts;
}
- /* this is to adjust RX more aggressively when streaming small
- * packets. The value of 40000 was picked as it is just beyond
- * what the hardware can receive per second if in low latency
- * mode.
+ /* The paths below assume we are dealing with a bulk ITR since
+ * number of packets is greater than 256. We are just going to have
+ * to compute a value and try to bring the count under control,
+ * though for smaller packet sizes there isn't much we can do as
+ * NAPI polling will likely be kicking in sooner rather than later.
*/
-#define RX_ULTRA_PACKET_RATE 40000
+ itr = I40E_ITR_ADAPTIVE_BULK;
- if ((((rc->total_packets * 1000000) / usecs) > RX_ULTRA_PACKET_RATE) &&
- (&qv->rx == rc))
- new_latency_range = I40E_ULTRA_LATENCY;
+adjust_by_size:
+ /* If packet counts are 256 or greater we can assume we have a gross
+ * overestimation of what the rate should be. Instead of trying to fine
+ * tune it just use the formula below to try and dial in an exact value
+ * give the current packet size of the frame.
+ */
+ avg_wire_size = bytes / packets;
- rc->latency_range = new_latency_range;
+ /* The following is a crude approximation of:
+ * wmem_default / (size + overhead) = desired_pkts_per_int
+ * rate / bits_per_byte / (size + ethernet overhead) = pkt_rate
+ * (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
+ *
+ * Assuming wmem_default is 212992 and overhead is 640 bytes per
+ * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
+ * formula down to
+ *
+ * (170 * (size + 24)) / (size + 640) = ITR
+ *
+ * We first do some math on the packet size and then finally bitshift
+ * by 8 after rounding up. We also have to account for PCIe link speed
+ * difference as ITR scales based on this.
+ */
+ if (avg_wire_size <= 60) {
+ /* Start at 250k ints/sec */
+ avg_wire_size = 4096;
+ } else if (avg_wire_size <= 380) {
+ /* 250K ints/sec to 60K ints/sec */
+ avg_wire_size *= 40;
+ avg_wire_size += 1696;
+ } else if (avg_wire_size <= 1084) {
+ /* 60K ints/sec to 36K ints/sec */
+ avg_wire_size *= 15;
+ avg_wire_size += 11452;
+ } else if (avg_wire_size <= 1980) {
+ /* 36K ints/sec to 30K ints/sec */
+ avg_wire_size *= 5;
+ avg_wire_size += 22420;
+ } else {
+ /* plateau at a limit of 30K ints/sec */
+ avg_wire_size = 32256;
+ }
- switch (new_latency_range) {
- case I40E_LOWEST_LATENCY:
- new_itr = I40E_ITR_50K;
- break;
- case I40E_LOW_LATENCY:
- new_itr = I40E_ITR_20K;
- break;
- case I40E_BULK_LATENCY:
- new_itr = I40E_ITR_18K;
- break;
- case I40E_ULTRA_LATENCY:
- new_itr = I40E_ITR_8K;
- break;
- default:
- break;
+ /* If we are in low latency mode halve our delay which doubles the
+ * rate to somewhere between 100K to 16K ints/sec
+ */
+ if (itr & I40E_ITR_ADAPTIVE_LATENCY)
+ avg_wire_size /= 2;
+
+ /* Resultant value is 256 times larger than it needs to be. This
+ * gives us room to adjust the value as needed to either increase
+ * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc.
+ *
+ * Use addition as we have already recorded the new latency flag
+ * for the ITR value.
+ */
+ itr += DIV_ROUND_UP(avg_wire_size, i40e_itr_divisor(q_vector)) *
+ I40E_ITR_ADAPTIVE_MIN_INC;
+
+ if ((itr & I40E_ITR_MASK) > I40E_ITR_ADAPTIVE_MAX_USECS) {
+ itr &= I40E_ITR_ADAPTIVE_LATENCY;
+ itr += I40E_ITR_ADAPTIVE_MAX_USECS;
}
+clear_counts:
+ /* write back value */
+ rc->target_itr = itr;
+
+ /* next update should occur within next jiffy */
+ rc->next_update = next_update + 1;
+
rc->total_bytes = 0;
rc->total_packets = 0;
+}
- if (new_itr != rc->itr) {
- rc->itr = new_itr;
- return true;
- }
-
- return false;
+static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
+{
+ return &rx_ring->rx_bi[idx];
}
/**
- * i40e_rx_is_programming_status - check for programming status descriptor
- * @qw: qword representing status_error_len in CPU ordering
+ * i40e_reuse_rx_page - page flip buffer and store it back on the ring
+ * @rx_ring: rx descriptor ring to store buffers on
+ * @old_buff: donor buffer to have page reused
*
- * The value of in the descriptor length field indicate if this
- * is a programming status descriptor for flow director or FCoE
- * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
- * it is a packet descriptor.
+ * Synchronizes page for reuse by the adapter
**/
-static inline bool i40e_rx_is_programming_status(u64 qw)
+static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
+ struct i40e_rx_buffer *old_buff)
{
- /* The Rx filter programming status and SPH bit occupy the same
- * spot in the descriptor. Since we don't support packet split we
- * can just reuse the bit as an indication that this is a
- * programming status descriptor.
- */
- return qw & I40E_RXD_QW1_LENGTH_SPH_MASK;
+ struct i40e_rx_buffer *new_buff;
+ u16 nta = rx_ring->next_to_alloc;
+
+ new_buff = i40e_rx_bi(rx_ring, nta);
+
+ /* update, and store next to alloc */
+ nta++;
+ rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+ /* transfer page from old buffer to new buffer */
+ new_buff->dma = old_buff->dma;
+ new_buff->page = old_buff->page;
+ new_buff->page_offset = old_buff->page_offset;
+ new_buff->pagecnt_bias = old_buff->pagecnt_bias;
+
+ /* clear contents of buffer_info */
+ old_buff->page = NULL;
}
/**
* i40e_clean_programming_status - clean the programming status descriptor
* @rx_ring: the rx ring that has this descriptor
- * @rx_desc: the rx descriptor written back by HW
- * @qw: qword representing status_error_len in CPU ordering
+ * @qword0_raw: qword0
+ * @qword1: qword1 representing status_error_len in CPU ordering
*
* Flow director should handle FD_FILTER_STATUS to check its filter programming
* status being successful or not and take actions accordingly. FCoE should
* handle its context/filter programming/invalidation status and take actions.
*
+ * Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL.
**/
-static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc,
- u64 qw)
+void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1)
{
- u32 ntc = rx_ring->next_to_clean + 1;
u8 id;
- /* fetch, update, and store next to clean */
- ntc = (ntc < rx_ring->count) ? ntc : 0;
- rx_ring->next_to_clean = ntc;
-
- prefetch(I40E_RX_DESC(rx_ring, ntc));
-
- id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
- I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
+ id = FIELD_GET(I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK, qword1);
if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
- i40e_fd_handle_status(rx_ring, rx_desc, id);
+ i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id);
}
/**
@@ -1113,6 +1425,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
if (!tx_ring->tx_bi)
goto err;
+ u64_stats_init(&tx_ring->syncp);
+
/* round up to nearest 4K */
tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
/* add u32 for head writeback, align after this takes care of
@@ -1130,6 +1444,7 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
tx_ring->next_to_use = 0;
tx_ring->next_to_clean = 0;
+ tx_ring->tx_stats.prev_pkt_ctr = -1;
return 0;
err:
@@ -1138,27 +1453,31 @@ err:
return -ENOMEM;
}
+static void i40e_clear_rx_bi(struct i40e_ring *rx_ring)
+{
+ memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count);
+}
+
/**
* i40e_clean_rx_ring - Free Rx buffers
* @rx_ring: ring to be cleaned
**/
void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
{
- unsigned long bi_size;
u16 i;
/* ring already cleared, nothing to do */
if (!rx_ring->rx_bi)
return;
- if (rx_ring->skb) {
- dev_kfree_skb(rx_ring->skb);
- rx_ring->skb = NULL;
+ if (rx_ring->xsk_pool) {
+ i40e_xsk_clean_rx_ring(rx_ring);
+ goto skip_free;
}
/* Free all the Rx ring sk_buffs */
for (i = 0; i < rx_ring->count; i++) {
- struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+ struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i);
if (!rx_bi->page)
continue;
@@ -1184,14 +1503,18 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
rx_bi->page_offset = 0;
}
- bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
- memset(rx_ring->rx_bi, 0, bi_size);
+skip_free:
+ if (rx_ring->xsk_pool)
+ i40e_clear_rx_bi_zc(rx_ring);
+ else
+ i40e_clear_rx_bi(rx_ring);
/* Zero out the descriptor ring */
memset(rx_ring->desc, 0, rx_ring->size);
rx_ring->next_to_alloc = 0;
rx_ring->next_to_clean = 0;
+ rx_ring->next_to_process = 0;
rx_ring->next_to_use = 0;
}
@@ -1204,6 +1527,8 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
void i40e_free_rx_resources(struct i40e_ring *rx_ring)
{
i40e_clean_rx_ring(rx_ring);
+ if (rx_ring->vsi->type == I40E_VSI_MAIN)
+ xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
rx_ring->xdp_prog = NULL;
kfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
@@ -1224,19 +1549,11 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
- int bi_size;
-
- /* warn if we are about to overwrite the pointer */
- WARN_ON(rx_ring->rx_bi);
- bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
- rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
- if (!rx_ring->rx_bi)
- goto err;
u64_stats_init(&rx_ring->syncp);
/* Round up to nearest 4K */
- rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
+ rx_ring->size = rx_ring->count * sizeof(union i40e_rx_desc);
rx_ring->size = ALIGN(rx_ring->size, 4096);
rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
&rx_ring->dma, GFP_KERNEL);
@@ -1244,20 +1561,22 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
if (!rx_ring->desc) {
dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
rx_ring->size);
- goto err;
+ return -ENOMEM;
}
rx_ring->next_to_alloc = 0;
rx_ring->next_to_clean = 0;
+ rx_ring->next_to_process = 0;
rx_ring->next_to_use = 0;
rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
+ rx_ring->rx_bi =
+ kcalloc(rx_ring->count, sizeof(*rx_ring->rx_bi), GFP_KERNEL);
+ if (!rx_ring->rx_bi)
+ return -ENOMEM;
+
return 0;
-err:
- kfree(rx_ring->rx_bi);
- rx_ring->rx_bi = NULL;
- return -ENOMEM;
}
/**
@@ -1265,7 +1584,7 @@ err:
* @rx_ring: ring to bump
* @val: new head index
**/
-static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
+void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
{
rx_ring->next_to_use = val;
@@ -1281,16 +1600,19 @@ static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
writel(val, rx_ring->tail);
}
-/**
- * i40e_rx_offset - Return expected offset into page to access data
- * @rx_ring: Ring we are requesting offset of
- *
- * Returns the offset value for ring into the data buffer.
- */
-static inline unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
+#if (PAGE_SIZE >= 8192)
+static unsigned int i40e_rx_frame_truesize(struct i40e_ring *rx_ring,
+ unsigned int size)
{
- return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
+ unsigned int truesize;
+
+ truesize = rx_ring->rx_offset ?
+ SKB_DATA_ALIGN(size + rx_ring->rx_offset) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+ SKB_DATA_ALIGN(size);
+ return truesize;
}
+#endif
/**
* i40e_alloc_mapped_page - recycle or make a new page
@@ -1319,6 +1641,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
return false;
}
+ rx_ring->rx_stats.page_alloc_count++;
+
/* map page for use */
dma = dma_map_page_attrs(rx_ring->dev, page, 0,
i40e_rx_pg_size(rx_ring),
@@ -1336,33 +1660,14 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
bi->dma = dma;
bi->page = page;
- bi->page_offset = i40e_rx_offset(rx_ring);
-
- /* initialize pagecnt_bias to 1 representing we fully own page */
- bi->pagecnt_bias = 1;
+ bi->page_offset = rx_ring->rx_offset;
+ page_ref_add(page, USHRT_MAX - 1);
+ bi->pagecnt_bias = USHRT_MAX;
return true;
}
/**
- * i40e_receive_skb - Send a completed packet up the stack
- * @rx_ring: rx ring in play
- * @skb: packet to send up
- * @vlan_tag: vlan tag for packet
- **/
-static void i40e_receive_skb(struct i40e_ring *rx_ring,
- struct sk_buff *skb, u16 vlan_tag)
-{
- struct i40e_q_vector *q_vector = rx_ring->q_vector;
-
- if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
- (vlan_tag & VLAN_VID_MASK))
- __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
-
- napi_gro_receive(&q_vector->napi, skb);
-}
-
-/**
* i40e_alloc_rx_buffers - Replace used receive buffers
* @rx_ring: ring to place buffers on
* @cleaned_count: number of buffers to replace
@@ -1380,7 +1685,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
return false;
rx_desc = I40E_RX_DESC(rx_ring, ntu);
- bi = &rx_ring->rx_bi[ntu];
+ bi = i40e_rx_bi(rx_ring, ntu);
do {
if (!i40e_alloc_mapped_page(rx_ring, bi))
@@ -1402,7 +1707,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
ntu++;
if (unlikely(ntu == rx_ring->count)) {
rx_desc = I40E_RX_DESC(rx_ring, 0);
- bi = rx_ring->rx_bi;
+ bi = i40e_rx_bi(rx_ring, 0);
ntu = 0;
}
@@ -1437,40 +1742,30 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
struct sk_buff *skb,
union i40e_rx_desc *rx_desc)
{
- struct i40e_rx_ptype_decoded decoded;
+ struct libeth_rx_pt decoded;
u32 rx_error, rx_status;
bool ipv4, ipv6;
u8 ptype;
u64 qword;
- qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
- ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT;
- rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >>
- I40E_RXD_QW1_ERROR_SHIFT;
- rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
- I40E_RXD_QW1_STATUS_SHIFT;
- decoded = decode_rx_desc_ptype(ptype);
-
skb->ip_summed = CHECKSUM_NONE;
- skb_checksum_none_assert(skb);
+ qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+ ptype = FIELD_GET(I40E_RXD_QW1_PTYPE_MASK, qword);
- /* Rx csum enabled and ip headers found? */
- if (!(vsi->netdev->features & NETIF_F_RXCSUM))
+ decoded = libie_rx_pt_parse(ptype);
+ if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded))
return;
+ rx_error = FIELD_GET(I40E_RXD_QW1_ERROR_MASK, qword);
+ rx_status = FIELD_GET(I40E_RXD_QW1_STATUS_MASK, qword);
+
/* did the hardware decode the packet and checksum? */
if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
return;
- /* both known and outer_ip must be set for the below code to work */
- if (!(decoded.known && decoded.outer_ip))
- return;
-
- ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
- (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4);
- ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
- (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6);
+ ipv4 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV4;
+ ipv6 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV6;
if (ipv4 &&
(rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) |
@@ -1498,20 +1793,10 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
* we need to bump the checksum level by 1 to reflect the fact that
* we are indicating we validated the inner checksum.
*/
- if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT)
+ if (decoded.tunnel_type >= LIBETH_RX_PT_TUNNEL_IP_GRENAT)
skb->csum_level = 1;
- /* Only report checksum unnecessary for TCP, UDP, or SCTP */
- switch (decoded.inner_prot) {
- case I40E_RX_PTYPE_INNER_PROT_TCP:
- case I40E_RX_PTYPE_INNER_PROT_UDP:
- case I40E_RX_PTYPE_INNER_PROT_SCTP:
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- /* fall though */
- default:
- break;
- }
-
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return;
checksum_fail:
@@ -1519,49 +1804,30 @@ checksum_fail:
}
/**
- * i40e_ptype_to_htype - get a hash type
- * @ptype: the ptype value from the descriptor
- *
- * Returns a hash type to be used by skb_set_hash
- **/
-static inline int i40e_ptype_to_htype(u8 ptype)
-{
- struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
-
- if (!decoded.known)
- return PKT_HASH_TYPE_NONE;
-
- if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
- decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4)
- return PKT_HASH_TYPE_L4;
- else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
- decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3)
- return PKT_HASH_TYPE_L3;
- else
- return PKT_HASH_TYPE_L2;
-}
-
-/**
* i40e_rx_hash - set the hash value in the skb
* @ring: descriptor ring
* @rx_desc: specific descriptor
+ * @skb: skb currently being received and modified
+ * @rx_ptype: Rx packet type
**/
static inline void i40e_rx_hash(struct i40e_ring *ring,
union i40e_rx_desc *rx_desc,
struct sk_buff *skb,
u8 rx_ptype)
{
+ struct libeth_rx_pt decoded;
u32 hash;
const __le64 rss_mask =
cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH <<
I40E_RX_DESC_STATUS_FLTSTAT_SHIFT);
- if (!(ring->netdev->features & NETIF_F_RXHASH))
+ decoded = libie_rx_pt_parse(rx_ptype);
+ if (!libeth_rx_pt_has_hash(ring->netdev, decoded))
return;
if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
- skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype));
+ libeth_rx_pt_set_hash(skb, hash, decoded);
}
}
@@ -1570,23 +1836,19 @@ static inline void i40e_rx_hash(struct i40e_ring *ring,
* @rx_ring: rx descriptor ring packet is being transacted on
* @rx_desc: pointer to the EOP Rx descriptor
* @skb: pointer to current skb being populated
- * @rx_ptype: the packet type decoded by hardware
*
* This function checks the ring, descriptor, and packet information in
* order to populate the hash, checksum, VLAN, protocol, and
* other fields within the skb.
**/
-static inline
void i40e_process_skb_fields(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc, struct sk_buff *skb,
- u8 rx_ptype)
+ union i40e_rx_desc *rx_desc, struct sk_buff *skb)
{
u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
- u32 rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
- I40E_RXD_QW1_STATUS_SHIFT;
+ u32 rx_status = FIELD_GET(I40E_RXD_QW1_STATUS_MASK, qword);
u32 tsynvalid = rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK;
- u32 tsyn = (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
- I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT;
+ u32 tsyn = FIELD_GET(I40E_RXD_QW1_STATUS_TSYNINDX_MASK, rx_status);
+ u8 rx_ptype = FIELD_GET(I40E_RXD_QW1_PTYPE_MASK, qword);
if (unlikely(tsynvalid))
i40e_ptp_rx_hwtstamp(rx_ring->vsi->back, skb, tsyn);
@@ -1597,6 +1859,13 @@ void i40e_process_skb_fields(struct i40e_ring *rx_ring,
skb_record_rx_queue(skb, rx_ring->queue_index);
+ if (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) {
+ __le16 vlan_tag = rx_desc->wb.qword0.lo_dword.l2tag1;
+
+ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+ le16_to_cpu(vlan_tag));
+ }
+
/* modifies the skb - consumes the enet header */
skb->protocol = eth_type_trans(skb, rx_ring->netdev);
}
@@ -1607,9 +1876,6 @@ void i40e_process_skb_fields(struct i40e_ring *rx_ring,
* @skb: pointer to current skb being fixed
* @rx_desc: pointer to the EOP Rx descriptor
*
- * Also address the case where we are pulling data in on pages only
- * and as such no data is present in the skb header.
- *
* In addition if skb is not at least 60 bytes we need to pad it so that
* it is large enough to qualify as a valid Ethernet frame.
*
@@ -1619,10 +1885,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
union i40e_rx_desc *rx_desc)
{
- /* XDP packets use error pointer so abort at this point */
- if (IS_ERR(skb))
- return true;
-
/* ERR_MASK will only have valid bits if EOP set, and
* what we are doing here is actually checking
* I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
@@ -1642,97 +1904,51 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
}
/**
- * i40e_reuse_rx_page - page flip buffer and store it back on the ring
- * @rx_ring: rx descriptor ring to store buffers on
- * @old_buff: donor buffer to have page reused
- *
- * Synchronizes page for reuse by the adapter
- **/
-static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *old_buff)
-{
- struct i40e_rx_buffer *new_buff;
- u16 nta = rx_ring->next_to_alloc;
-
- new_buff = &rx_ring->rx_bi[nta];
-
- /* update, and store next to alloc */
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- /* transfer page from old buffer to new buffer */
- new_buff->dma = old_buff->dma;
- new_buff->page = old_buff->page;
- new_buff->page_offset = old_buff->page_offset;
- new_buff->pagecnt_bias = old_buff->pagecnt_bias;
-}
-
-/**
- * i40e_page_is_reusable - check if any reuse is possible
- * @page: page struct to check
- *
- * A page is not reusable if it was allocated under low memory
- * conditions, or it's not in the same NUMA node as this CPU.
- */
-static inline bool i40e_page_is_reusable(struct page *page)
-{
- return (page_to_nid(page) == numa_mem_id()) &&
- !page_is_pfmemalloc(page);
-}
-
-/**
- * i40e_can_reuse_rx_page - Determine if this page can be reused by
- * the adapter for another receive
- *
+ * i40e_can_reuse_rx_page - Determine if page can be reused for another Rx
* @rx_buffer: buffer containing the page
+ * @rx_stats: rx stats structure for the rx ring
*
- * If page is reusable, rx_buffer->page_offset is adjusted to point to
- * an unused region in the page.
- *
- * For small pages, @truesize will be a constant value, half the size
- * of the memory at page. We'll attempt to alternate between high and
- * low halves of the page, with one half ready for use by the hardware
- * and the other half being consumed by the stack. We use the page
- * ref count to determine whether the stack has finished consuming the
- * portion of this page that was passed up with a previous packet. If
- * the page ref count is >1, we'll assume the "other" half page is
- * still busy, and this page cannot be reused.
+ * If page is reusable, we have a green light for calling i40e_reuse_rx_page,
+ * which will assign the current buffer to the buffer that next_to_alloc is
+ * pointing to; otherwise, the DMA mapping needs to be destroyed and
+ * page freed.
*
- * For larger pages, @truesize will be the actual space used by the
- * received packet (adjusted upward to an even multiple of the cache
- * line size). This will advance through the page by the amount
- * actually consumed by the received packets while there is still
- * space for a buffer. Each region of larger pages will be used at
- * most once, after which the page will not be reused.
- *
- * In either case, if the page is reusable its refcount is increased.
- **/
-static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer)
+ * rx_stats will be updated to indicate whether the page was waived
+ * or busy if it could not be reused.
+ */
+static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer,
+ struct i40e_rx_queue_stats *rx_stats)
{
unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
struct page *page = rx_buffer->page;
/* Is any reuse possible? */
- if (unlikely(!i40e_page_is_reusable(page)))
+ if (!dev_page_is_reusable(page)) {
+ rx_stats->page_waive_count++;
return false;
+ }
#if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
- if (unlikely((page_count(page) - pagecnt_bias) > 1))
+ if (unlikely((rx_buffer->page_count - pagecnt_bias) > 1)) {
+ rx_stats->page_busy_count++;
return false;
+ }
#else
#define I40E_LAST_OFFSET \
(SKB_WITH_OVERHEAD(PAGE_SIZE) - I40E_RXBUFFER_2048)
- if (rx_buffer->page_offset > I40E_LAST_OFFSET)
+ if (rx_buffer->page_offset > I40E_LAST_OFFSET) {
+ rx_stats->page_busy_count++;
return false;
+ }
#endif
/* If we have drained the page fragment pool we need to update
* the pagecnt_bias and page count so that we fully restock the
* number of references the driver holds.
*/
- if (unlikely(!pagecnt_bias)) {
- page_ref_add(page, USHRT_MAX);
+ if (unlikely(pagecnt_bias == 1)) {
+ page_ref_add(page, USHRT_MAX - 1);
rx_buffer->pagecnt_bias = USHRT_MAX;
}
@@ -1740,33 +1956,14 @@ static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer)
}
/**
- * i40e_add_rx_frag - Add contents of Rx buffer to sk_buff
- * @rx_ring: rx descriptor ring to transact packets on
- * @rx_buffer: buffer containing page to add
- * @skb: sk_buff to place the data into
- * @size: packet length from rx_desc
- *
- * This function will add the data contained in rx_buffer->page to the skb.
- * It will just attach the page as a frag to the skb.
- *
- * The function will then update the page offset.
+ * i40e_rx_buffer_flip - adjusted rx_buffer to point to an unused region
+ * @rx_buffer: Rx buffer to adjust
+ * @truesize: Size of adjustment
**/
-static void i40e_add_rx_frag(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *rx_buffer,
- struct sk_buff *skb,
- unsigned int size)
+static void i40e_rx_buffer_flip(struct i40e_rx_buffer *rx_buffer,
+ unsigned int truesize)
{
#if (PAGE_SIZE < 8192)
- unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
-#else
- unsigned int truesize = SKB_DATA_ALIGN(size + i40e_rx_offset(rx_ring));
-#endif
-
- skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
- rx_buffer->page_offset, size, truesize);
-
- /* page is being used so we must update the page offset */
-#if (PAGE_SIZE < 8192)
rx_buffer->page_offset ^= truesize;
#else
rx_buffer->page_offset += truesize;
@@ -1786,8 +1983,14 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
{
struct i40e_rx_buffer *rx_buffer;
- rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
- prefetchw(rx_buffer->page);
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_process);
+ rx_buffer->page_count =
+#if (PAGE_SIZE < 8192)
+ page_count(rx_buffer->page);
+#else
+ 0;
+#endif
+ prefetch_page_address(rx_buffer->page);
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev,
@@ -1803,9 +2006,70 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
}
/**
- * i40e_construct_skb - Allocate skb and populate it
+ * i40e_put_rx_buffer - Clean up used buffer and either recycle or free
* @rx_ring: rx descriptor ring to transact packets on
* @rx_buffer: rx buffer to pull data from
+ *
+ * This function will clean up the contents of the rx_buffer. It will
+ * either recycle the buffer or unmap it and free the associated resources.
+ */
+static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
+ struct i40e_rx_buffer *rx_buffer)
+{
+ if (i40e_can_reuse_rx_page(rx_buffer, &rx_ring->rx_stats)) {
+ /* hand second half of page back to the ring */
+ i40e_reuse_rx_page(rx_ring, rx_buffer);
+ } else {
+ /* we are not reusing the buffer so unmap it */
+ dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
+ i40e_rx_pg_size(rx_ring),
+ DMA_FROM_DEVICE, I40E_RX_DMA_ATTR);
+ __page_frag_cache_drain(rx_buffer->page,
+ rx_buffer->pagecnt_bias);
+ /* clear contents of buffer_info */
+ rx_buffer->page = NULL;
+ }
+}
+
+/**
+ * i40e_process_rx_buffs- Processing of buffers post XDP prog or on error
+ * @rx_ring: Rx descriptor ring to transact packets on
+ * @xdp_res: Result of the XDP program
+ * @xdp: xdp_buff pointing to the data
+ **/
+static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+ struct xdp_buff *xdp)
+{
+ u32 nr_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
+ u32 next = rx_ring->next_to_clean, i = 0;
+ struct i40e_rx_buffer *rx_buffer;
+
+ xdp->flags = 0;
+
+ while (1) {
+ rx_buffer = i40e_rx_bi(rx_ring, next);
+ if (++next == rx_ring->count)
+ next = 0;
+
+ if (!rx_buffer->page)
+ continue;
+
+ if (xdp_res != I40E_XDP_CONSUMED)
+ i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
+ else if (i++ <= nr_frags)
+ rx_buffer->pagecnt_bias++;
+
+ /* EOP buffer will be put in i40e_clean_rx_irq() */
+ if (next == rx_ring->next_to_process)
+ return;
+
+ i40e_put_rx_buffer(rx_ring, rx_buffer);
+ }
+}
+
+/**
+ * i40e_construct_skb - Allocate skb and populate it
+ * @rx_ring: rx descriptor ring to transact packets on
* @xdp: xdp_buff pointing to the data
*
* This function allocates an skb. It then populates it with the page
@@ -1813,158 +2077,159 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
* skb correctly.
*/
static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *rx_buffer,
struct xdp_buff *xdp)
{
unsigned int size = xdp->data_end - xdp->data;
-#if (PAGE_SIZE < 8192)
- unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
-#else
- unsigned int truesize = SKB_DATA_ALIGN(size);
-#endif
+ struct i40e_rx_buffer *rx_buffer;
+ struct skb_shared_info *sinfo;
unsigned int headlen;
struct sk_buff *skb;
+ u32 nr_frags = 0;
/* prefetch first cache line of first page */
- prefetch(xdp->data);
-#if L1_CACHE_BYTES < 128
- prefetch(xdp->data + L1_CACHE_BYTES);
-#endif
+ net_prefetch(xdp->data);
+
+ /* Note, we get here by enabling legacy-rx via:
+ *
+ * ethtool --set-priv-flags <dev> legacy-rx on
+ *
+ * In this mode, we currently get 0 extra XDP headroom as
+ * opposed to having legacy-rx off, where we process XDP
+ * packets going to stack via i40e_build_skb(). The latter
+ * provides us currently with 192 bytes of headroom.
+ *
+ * For i40e_construct_skb() mode it means that the
+ * xdp->data_meta will always point to xdp->data, since
+ * the helper cannot expand the head. Should this ever
+ * change in future for legacy-rx mode on, then lets also
+ * add xdp->data_meta handling here.
+ */
/* allocate a skb to store the frags */
- skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
- I40E_RX_HDR_SIZE,
- GFP_ATOMIC | __GFP_NOWARN);
+ skb = napi_alloc_skb(&rx_ring->q_vector->napi, I40E_RX_HDR_SIZE);
if (unlikely(!skb))
return NULL;
/* Determine available headroom for copy */
headlen = size;
if (headlen > I40E_RX_HDR_SIZE)
- headlen = eth_get_headlen(xdp->data, I40E_RX_HDR_SIZE);
+ headlen = eth_get_headlen(skb->dev, xdp->data,
+ I40E_RX_HDR_SIZE);
/* align pull length to size of long to optimize memcpy performance */
memcpy(__skb_put(skb, headlen), xdp->data,
ALIGN(headlen, sizeof(long)));
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
/* update all of the pointers */
size -= headlen;
if (size) {
+ if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
+ dev_kfree_skb(skb);
+ return NULL;
+ }
skb_add_rx_frag(skb, 0, rx_buffer->page,
rx_buffer->page_offset + headlen,
- size, truesize);
-
+ size, xdp->frame_sz);
/* buffer is used by skb, update page_offset */
-#if (PAGE_SIZE < 8192)
- rx_buffer->page_offset ^= truesize;
-#else
- rx_buffer->page_offset += truesize;
-#endif
+ i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
} else {
/* buffer is unused, reset bias back to rx_buffer */
rx_buffer->pagecnt_bias++;
}
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ struct skb_shared_info *skinfo = skb_shinfo(skb);
+
+ memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0],
+ sizeof(skb_frag_t) * nr_frags);
+
+ xdp_update_skb_frags_info(skb, skinfo->nr_frags + nr_frags,
+ sinfo->xdp_frags_size,
+ nr_frags * xdp->frame_sz,
+ xdp_buff_get_skb_flags(xdp));
+
+ /* First buffer has already been processed, so bump ntc */
+ if (++rx_ring->next_to_clean == rx_ring->count)
+ rx_ring->next_to_clean = 0;
+
+ i40e_process_rx_buffs(rx_ring, I40E_XDP_PASS, xdp);
+ }
+
return skb;
}
/**
* i40e_build_skb - Build skb around an existing buffer
* @rx_ring: Rx descriptor ring to transact packets on
- * @rx_buffer: Rx buffer to pull data from
* @xdp: xdp_buff pointing to the data
*
* This function builds an skb around an existing Rx buffer, taking care
* to set up the skb correctly and avoid any memcpy overhead.
*/
static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *rx_buffer,
struct xdp_buff *xdp)
{
- unsigned int size = xdp->data_end - xdp->data;
-#if (PAGE_SIZE < 8192)
- unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
-#else
- unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
- SKB_DATA_ALIGN(I40E_SKB_PAD + size);
-#endif
+ unsigned int metasize = xdp->data - xdp->data_meta;
+ struct skb_shared_info *sinfo;
struct sk_buff *skb;
+ u32 nr_frags;
+
+ /* Prefetch first cache line of first page. If xdp->data_meta
+ * is unused, this points exactly as xdp->data, otherwise we
+ * likely have a consumer accessing first few bytes of meta
+ * data, and then actual data.
+ */
+ net_prefetch(xdp->data_meta);
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
- /* prefetch first cache line of first page */
- prefetch(xdp->data);
-#if L1_CACHE_BYTES < 128
- prefetch(xdp->data + L1_CACHE_BYTES);
-#endif
/* build an skb around the page buffer */
- skb = build_skb(xdp->data_hard_start, truesize);
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
if (unlikely(!skb))
return NULL;
/* update pointers within the skb to store the data */
- skb_reserve(skb, I40E_SKB_PAD);
- __skb_put(skb, size);
-
- /* buffer is used by skb, update page_offset */
-#if (PAGE_SIZE < 8192)
- rx_buffer->page_offset ^= truesize;
-#else
- rx_buffer->page_offset += truesize;
-#endif
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
+ if (metasize)
+ skb_metadata_set(skb, metasize);
- return skb;
-}
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ nr_frags * xdp->frame_sz,
+ xdp_buff_get_skb_flags(xdp));
-/**
- * i40e_put_rx_buffer - Clean up used buffer and either recycle or free
- * @rx_ring: rx descriptor ring to transact packets on
- * @rx_buffer: rx buffer to pull data from
- *
- * This function will clean up the contents of the rx_buffer. It will
- * either recycle the bufer or unmap it and free the associated resources.
- */
-static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *rx_buffer)
-{
- if (i40e_can_reuse_rx_page(rx_buffer)) {
- /* hand second half of page back to the ring */
- i40e_reuse_rx_page(rx_ring, rx_buffer);
- rx_ring->rx_stats.page_reuse_count++;
+ i40e_process_rx_buffs(rx_ring, I40E_XDP_PASS, xdp);
} else {
- /* we are not reusing the buffer so unmap it */
- dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
- i40e_rx_pg_size(rx_ring),
- DMA_FROM_DEVICE, I40E_RX_DMA_ATTR);
- __page_frag_cache_drain(rx_buffer->page,
- rx_buffer->pagecnt_bias);
+ struct i40e_rx_buffer *rx_buffer;
+
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ /* buffer is used by skb, update page_offset */
+ i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
}
- /* clear contents of buffer_info */
- rx_buffer->page = NULL;
+ return skb;
}
/**
* i40e_is_non_eop - process handling of non-EOP buffers
* @rx_ring: Rx ring being processed
* @rx_desc: Rx descriptor for current buffer
- * @skb: Current socket buffer containing buffer in progress
*
- * This function updates next to clean. If the buffer is an EOP buffer
- * this function exits returning false, otherwise it will place the
- * sk_buff in the next buffer to be chained and return true indicating
- * that this is in fact a non-EOP buffer.
- **/
-static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc,
- struct sk_buff *skb)
+ * If the buffer is an EOP buffer, this function exits returning false,
+ * otherwise return true indicating that this is in fact a non-EOP buffer.
+ */
+bool i40e_is_non_eop(struct i40e_ring *rx_ring,
+ union i40e_rx_desc *rx_desc)
{
- u32 ntc = rx_ring->next_to_clean + 1;
-
- /* fetch, update, and store next to clean */
- ntc = (ntc < rx_ring->count) ? ntc : 0;
- rx_ring->next_to_clean = ntc;
-
- prefetch(I40E_RX_DESC(rx_ring, ntc));
-
/* if we are the last buffer then there is nothing else to do */
#define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)
if (likely(i40e_test_staterr(rx_desc, I40E_RXD_EOF)))
@@ -1975,79 +2240,191 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
return true;
}
-#define I40E_XDP_PASS 0
-#define I40E_XDP_CONSUMED 1
-#define I40E_XDP_TX 2
-
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
struct i40e_ring *xdp_ring);
+int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring)
+{
+ struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
+
+ if (unlikely(!xdpf))
+ return I40E_XDP_CONSUMED;
+
+ return i40e_xmit_xdp_ring(xdpf, xdp_ring);
+}
+
/**
* i40e_run_xdp - run an XDP program
* @rx_ring: Rx ring being processed
* @xdp: XDP buffer containing the frame
+ * @xdp_prog: XDP program to run
**/
-static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
- struct xdp_buff *xdp)
+static int i40e_run_xdp(struct i40e_ring *rx_ring, struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
- int result = I40E_XDP_PASS;
+ int err, result = I40E_XDP_PASS;
struct i40e_ring *xdp_ring;
- struct bpf_prog *xdp_prog;
u32 act;
- rcu_read_lock();
- xdp_prog = READ_ONCE(rx_ring->xdp_prog);
-
if (!xdp_prog)
goto xdp_out;
+ prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
act = bpf_prog_run_xdp(xdp_prog, xdp);
switch (act) {
case XDP_PASS:
break;
case XDP_TX:
xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
- result = i40e_xmit_xdp_ring(xdp, xdp_ring);
+ result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
+ if (result == I40E_XDP_CONSUMED)
+ goto out_failure;
+ break;
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
+ if (err)
+ goto out_failure;
+ result = I40E_XDP_REDIR;
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act);
+ fallthrough;
case XDP_ABORTED:
+out_failure:
trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
- /* fallthrough -- handle aborts by dropping packet */
+ fallthrough; /* handle aborts by dropping packet */
case XDP_DROP:
result = I40E_XDP_CONSUMED;
break;
}
xdp_out:
- rcu_read_unlock();
- return ERR_PTR(-result);
+ return result;
}
/**
- * i40e_rx_buffer_flip - adjusted rx_buffer to point to an unused region
+ * i40e_xdp_ring_update_tail - Updates the XDP Tx ring tail register
+ * @xdp_ring: XDP Tx ring
+ *
+ * This function updates the XDP Tx ring tail register.
+ **/
+void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring)
+{
+ /* Force memory writes to complete before letting h/w
+ * know there are new descriptors to fetch.
+ */
+ wmb();
+ writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail);
+}
+
+/**
+ * i40e_update_rx_stats - Update Rx ring statistics
+ * @rx_ring: rx descriptor ring
+ * @total_rx_bytes: number of bytes received
+ * @total_rx_packets: number of packets received
+ *
+ * This function updates the Rx ring statistics.
+ **/
+void i40e_update_rx_stats(struct i40e_ring *rx_ring,
+ unsigned int total_rx_bytes,
+ unsigned int total_rx_packets)
+{
+ u64_stats_update_begin(&rx_ring->syncp);
+ rx_ring->stats.packets += total_rx_packets;
+ rx_ring->stats.bytes += total_rx_bytes;
+ u64_stats_update_end(&rx_ring->syncp);
+ rx_ring->q_vector->rx.total_packets += total_rx_packets;
+ rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
+}
+
+/**
+ * i40e_finalize_xdp_rx - Bump XDP Tx tail and/or flush redirect map
* @rx_ring: Rx ring
- * @rx_buffer: Rx buffer to adjust
- * @size: Size of adjustment
+ * @xdp_res: Result of the receive batch
+ *
+ * This function bumps XDP Tx tail and/or flush redirect map, and
+ * should be called when a batch of packets has been processed in the
+ * napi loop.
**/
-static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *rx_buffer,
- unsigned int size)
+void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, unsigned int xdp_res)
{
-#if (PAGE_SIZE < 8192)
- unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
+ if (xdp_res & I40E_XDP_REDIR)
+ xdp_do_flush();
- rx_buffer->page_offset ^= truesize;
-#else
- unsigned int truesize = SKB_DATA_ALIGN(i40e_rx_offset(rx_ring) + size);
+ if (xdp_res & I40E_XDP_TX) {
+ struct i40e_ring *xdp_ring =
+ rx_ring->vsi->xdp_rings[rx_ring->queue_index];
- rx_buffer->page_offset += truesize;
-#endif
+ i40e_xdp_ring_update_tail(xdp_ring);
+ }
+}
+
+/**
+ * i40e_inc_ntp: Advance the next_to_process index
+ * @rx_ring: Rx ring
+ **/
+static void i40e_inc_ntp(struct i40e_ring *rx_ring)
+{
+ u32 ntp = rx_ring->next_to_process + 1;
+
+ ntp = (ntp < rx_ring->count) ? ntp : 0;
+ rx_ring->next_to_process = ntp;
+ prefetch(I40E_RX_DESC(rx_ring, ntp));
+}
+
+/**
+ * i40e_add_xdp_frag: Add a frag to xdp_buff
+ * @xdp: xdp_buff pointing to the data
+ * @nr_frags: return number of buffers for the packet
+ * @rx_buffer: rx_buffer holding data of the current frag
+ * @size: size of data of current frag
+ */
+static int i40e_add_xdp_frag(struct xdp_buff *xdp, u32 *nr_frags,
+ struct i40e_rx_buffer *rx_buffer, u32 size)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+
+ if (!xdp_buff_has_frags(xdp)) {
+ sinfo->nr_frags = 0;
+ sinfo->xdp_frags_size = 0;
+ xdp_buff_set_frags_flag(xdp);
+ } else if (unlikely(sinfo->nr_frags >= MAX_SKB_FRAGS)) {
+ /* Overflowing packet: All frags need to be dropped */
+ return -ENOMEM;
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buffer->page,
+ rx_buffer->page_offset, size);
+
+ sinfo->xdp_frags_size += size;
+
+ if (page_is_pfmemalloc(rx_buffer->page))
+ xdp_buff_set_frag_pfmemalloc(xdp);
+ *nr_frags = sinfo->nr_frags;
+
+ return 0;
+}
+
+/**
+ * i40e_consume_xdp_buff - Consume all the buffers of the packet and update ntc
+ * @rx_ring: rx descriptor ring to transact packets on
+ * @xdp: xdp_buff pointing to the data
+ * @rx_buffer: rx_buffer of eop desc
+ */
+static void i40e_consume_xdp_buff(struct i40e_ring *rx_ring,
+ struct xdp_buff *xdp,
+ struct i40e_rx_buffer *rx_buffer)
+{
+ i40e_process_rx_buffs(rx_ring, I40E_XDP_CONSUMED, xdp);
+ i40e_put_rx_buffer(rx_ring, rx_buffer);
+ rx_ring->next_to_clean = rx_ring->next_to_process;
+ xdp->data = NULL;
}
/**
* i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
* @rx_ring: rx descriptor ring to transact packets on
* @budget: Total limit on number of packets to process
+ * @rx_cleaned: Out parameter of the number of packets processed
*
* This function provides a "bounce buffer" approach to Rx interrupt
* processing. The advantage to this is that on systems that have
@@ -2056,30 +2433,39 @@ static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring,
*
* Returns amount of work completed
**/
-static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
+static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget,
+ unsigned int *rx_cleaned)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
- struct sk_buff *skb = rx_ring->skb;
u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
- bool failure = false, xdp_xmit = false;
+ u16 clean_threshold = rx_ring->count / 2;
+ unsigned int offset = rx_ring->rx_offset;
+ struct xdp_buff *xdp = &rx_ring->xdp;
+ unsigned int xdp_xmit = 0;
+ struct bpf_prog *xdp_prog;
+ bool failure = false;
+ int xdp_res = 0;
- while (likely(total_rx_packets < budget)) {
+ xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+
+ while (likely(total_rx_packets < (unsigned int)budget)) {
+ u16 ntp = rx_ring->next_to_process;
struct i40e_rx_buffer *rx_buffer;
union i40e_rx_desc *rx_desc;
- struct xdp_buff xdp;
+ struct sk_buff *skb;
unsigned int size;
- u16 vlan_tag;
- u8 rx_ptype;
+ u32 nfrags = 0;
+ bool neop;
u64 qword;
/* return some buffers to hardware, one at a time is too slow */
- if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
+ if (cleaned_count >= clean_threshold) {
failure = failure ||
i40e_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}
- rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
+ rx_desc = I40E_RX_DESC(rx_ring, ntp);
/* status_error_len will always be zero for unused descriptors
* because it's cleared in cleanup, and overlaps with hdr_addr
@@ -2094,136 +2480,179 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
*/
dma_rmb();
- if (unlikely(i40e_rx_is_programming_status(qword))) {
- i40e_clean_programming_status(rx_ring, rx_desc, qword);
+ if (i40e_rx_is_programming_status(qword)) {
+ i40e_clean_programming_status(rx_ring,
+ rx_desc->raw.qword[0],
+ qword);
+ rx_buffer = i40e_rx_bi(rx_ring, ntp);
+ i40e_inc_ntp(rx_ring);
+ i40e_reuse_rx_page(rx_ring, rx_buffer);
+ /* Update ntc and bump cleaned count if not in the
+ * middle of mb packet.
+ */
+ if (rx_ring->next_to_clean == ntp) {
+ rx_ring->next_to_clean =
+ rx_ring->next_to_process;
+ cleaned_count++;
+ }
continue;
}
- size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
- I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
+
+ size = FIELD_GET(I40E_RXD_QW1_LENGTH_PBUF_MASK, qword);
if (!size)
break;
- i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb);
+ i40e_trace(clean_rx_irq, rx_ring, rx_desc, xdp);
+ /* retrieve a buffer from the ring */
rx_buffer = i40e_get_rx_buffer(rx_ring, size);
- /* retrieve a buffer from the ring */
- if (!skb) {
- xdp.data = page_address(rx_buffer->page) +
- rx_buffer->page_offset;
- xdp.data_hard_start = xdp.data -
- i40e_rx_offset(rx_ring);
- xdp.data_end = xdp.data + size;
-
- skb = i40e_run_xdp(rx_ring, &xdp);
- }
+ neop = i40e_is_non_eop(rx_ring, rx_desc);
+ i40e_inc_ntp(rx_ring);
- if (IS_ERR(skb)) {
- if (PTR_ERR(skb) == -I40E_XDP_TX) {
- xdp_xmit = true;
- i40e_rx_buffer_flip(rx_ring, rx_buffer, size);
- } else {
- rx_buffer->pagecnt_bias++;
- }
- total_rx_bytes += size;
- total_rx_packets++;
- } else if (skb) {
- i40e_add_rx_frag(rx_ring, rx_buffer, skb, size);
- } else if (ring_uses_build_skb(rx_ring)) {
- skb = i40e_build_skb(rx_ring, rx_buffer, &xdp);
- } else {
- skb = i40e_construct_skb(rx_ring, rx_buffer, &xdp);
- }
+ if (!xdp->data) {
+ unsigned char *hard_start;
- /* exit if we failed to retrieve a buffer */
- if (!skb) {
- rx_ring->rx_stats.alloc_buff_failed++;
- rx_buffer->pagecnt_bias++;
+ hard_start = page_address(rx_buffer->page) +
+ rx_buffer->page_offset - offset;
+ xdp_prepare_buff(xdp, hard_start, offset, size, true);
+#if (PAGE_SIZE > 4096)
+ /* At larger PAGE_SIZE, frame_sz depend on len size */
+ xdp->frame_sz = i40e_rx_frame_truesize(rx_ring, size);
+#endif
+ } else if (i40e_add_xdp_frag(xdp, &nfrags, rx_buffer, size) &&
+ !neop) {
+ /* Overflowing packet: Drop all frags on EOP */
+ i40e_consume_xdp_buff(rx_ring, xdp, rx_buffer);
break;
}
- i40e_put_rx_buffer(rx_ring, rx_buffer);
- cleaned_count++;
-
- if (i40e_is_non_eop(rx_ring, rx_desc, skb))
+ if (neop)
continue;
- if (i40e_cleanup_headers(rx_ring, skb, rx_desc)) {
- skb = NULL;
- continue;
- }
+ xdp_res = i40e_run_xdp(rx_ring, xdp, xdp_prog);
- /* probably a little skewed due to removing CRC */
- total_rx_bytes += skb->len;
+ if (xdp_res) {
+ xdp_xmit |= xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR);
- qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
- rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
- I40E_RXD_QW1_PTYPE_SHIFT;
-
- /* populate checksum, VLAN, and protocol */
- i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ i40e_process_rx_buffs(rx_ring, xdp_res, xdp);
+ size = xdp_get_buff_len(xdp);
+ } else if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
+ i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
+ } else {
+ rx_buffer->pagecnt_bias++;
+ }
+ total_rx_bytes += size;
+ } else {
+ if (ring_uses_build_skb(rx_ring))
+ skb = i40e_build_skb(rx_ring, xdp);
+ else
+ skb = i40e_construct_skb(rx_ring, xdp);
- vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
- le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
+ /* drop if we failed to retrieve a buffer */
+ if (!skb) {
+ rx_ring->rx_stats.alloc_buff_failed++;
+ i40e_consume_xdp_buff(rx_ring, xdp, rx_buffer);
+ break;
+ }
- i40e_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb);
- i40e_receive_skb(rx_ring, skb, vlan_tag);
- skb = NULL;
+ if (i40e_cleanup_headers(rx_ring, skb, rx_desc))
+ goto process_next;
- /* update budget accounting */
- total_rx_packets++;
- }
+ /* probably a little skewed due to removing CRC */
+ total_rx_bytes += skb->len;
- if (xdp_xmit) {
- struct i40e_ring *xdp_ring;
+ /* populate checksum, VLAN, and protocol */
+ i40e_process_skb_fields(rx_ring, rx_desc, skb);
- xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
+ i40e_trace(clean_rx_irq_rx, rx_ring, rx_desc, xdp);
+ napi_gro_receive(&rx_ring->q_vector->napi, skb);
+ }
- /* Force memory writes to complete before letting h/w
- * know there are new descriptors to fetch.
- */
- wmb();
+ /* update budget accounting */
+ total_rx_packets++;
+process_next:
+ cleaned_count += nfrags + 1;
+ i40e_put_rx_buffer(rx_ring, rx_buffer);
+ rx_ring->next_to_clean = rx_ring->next_to_process;
- writel(xdp_ring->next_to_use, xdp_ring->tail);
+ xdp->data = NULL;
}
- rx_ring->skb = skb;
+ i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
- u64_stats_update_begin(&rx_ring->syncp);
- rx_ring->stats.packets += total_rx_packets;
- rx_ring->stats.bytes += total_rx_bytes;
- u64_stats_update_end(&rx_ring->syncp);
- rx_ring->q_vector->rx.total_packets += total_rx_packets;
- rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
+ i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
+
+ *rx_cleaned = total_rx_packets;
/* guarantee a trip back through this routine if there was a failure */
- return failure ? budget : total_rx_packets;
+ return failure ? budget : (int)total_rx_packets;
}
-static u32 i40e_buildreg_itr(const int type, const u16 itr)
+/**
+ * i40e_buildreg_itr - build a value for writing to I40E_PFINT_DYN_CTLN register
+ * @itr_idx: interrupt throttling index
+ * @interval: interrupt throttling interval value in usecs
+ * @force_swint: force software interrupt
+ *
+ * The function builds a value for I40E_PFINT_DYN_CTLN register that
+ * is used to update interrupt throttling interval for specified ITR index
+ * and optionally enforces a software interrupt. If the @itr_idx is equal
+ * to I40E_ITR_NONE then no interval change is applied and only @force_swint
+ * parameter is taken into account. If the interval change and enforced
+ * software interrupt are not requested then the built value just enables
+ * appropriate vector interrupt.
+ **/
+static u32 i40e_buildreg_itr(enum i40e_dyn_idx itr_idx, u16 interval,
+ bool force_swint)
{
u32 val;
+ /* We don't bother with setting the CLEARPBA bit as the data sheet
+ * points out doing so is "meaningless since it was already
+ * auto-cleared". The auto-clearing happens when the interrupt is
+ * asserted.
+ *
+ * Hardware errata 28 for also indicates that writing to a
+ * xxINT_DYN_CTLx CSR with INTENA_MSK (bit 31) set to 0 will clear
+ * an event in the PBA anyway so we need to rely on the automask
+ * to hold pending events for us until the interrupt is re-enabled
+ *
+ * We have to shift the given value as it is reported in microseconds
+ * and the register value is recorded in 2 microsecond units.
+ */
+ interval >>= 1;
+
+ /* 1. Enable vector interrupt
+ * 2. Update the interval for the specified ITR index
+ * (I40E_ITR_NONE in the register is used to indicate that
+ * no interval update is requested)
+ */
val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
- /* Don't clear PBA because that can cause lost interrupts that
- * came in while we were cleaning/polling
- */
- (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
- (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);
+ FIELD_PREP(I40E_PFINT_DYN_CTLN_ITR_INDX_MASK, itr_idx) |
+ FIELD_PREP(I40E_PFINT_DYN_CTLN_INTERVAL_MASK, interval);
- return val;
-}
+ /* 3. Enforce software interrupt trigger if requested
+ * (These software interrupts rate is limited by ITR2 that is
+ * set to 20K interrupts per second)
+ */
+ if (force_swint)
+ val |= I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK |
+ I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK |
+ FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK,
+ I40E_SW_ITR);
-/* a small macro to shorten up some long lines */
-#define INTREG I40E_PFINT_DYN_CTLN
-static inline int get_rx_itr(struct i40e_vsi *vsi, int idx)
-{
- return vsi->rx_rings[idx]->rx_itr_setting;
+ return val;
}
-static inline int get_tx_itr(struct i40e_vsi *vsi, int idx)
-{
- return vsi->tx_rings[idx]->tx_itr_setting;
-}
+/* The act of updating the ITR will cause it to immediately trigger. In order
+ * to prevent this from throwing off adaptive update statistics we defer the
+ * update so that it can only happen so often. So after either Tx or Rx are
+ * updated we make the adaptive scheme wait until either the ITR completely
+ * expires via the next_update expiration or we have been through at least
+ * 3 interrupts.
+ */
+#define ITR_COUNTDOWN_START 3
/**
* i40e_update_enable_itr - Update itr and re-enable MSIX interrupt
@@ -2234,74 +2663,71 @@ static inline int get_tx_itr(struct i40e_vsi *vsi, int idx)
static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
struct i40e_q_vector *q_vector)
{
+ enum i40e_dyn_idx itr_idx = I40E_ITR_NONE;
struct i40e_hw *hw = &vsi->back->hw;
- bool rx = false, tx = false;
- u32 rxval, txval;
- int vector;
- int idx = q_vector->v_idx;
- int rx_itr_setting, tx_itr_setting;
-
- vector = (q_vector->v_idx + vsi->base_vector);
-
- /* avoid dynamic calculation if in countdown mode OR if
- * all dynamic is disabled
- */
- rxval = txval = i40e_buildreg_itr(I40E_ITR_NONE, 0);
-
- rx_itr_setting = get_rx_itr(vsi, idx);
- tx_itr_setting = get_tx_itr(vsi, idx);
+ u16 interval = 0;
+ u32 itr_val;
- if (q_vector->itr_countdown > 0 ||
- (!ITR_IS_DYNAMIC(rx_itr_setting) &&
- !ITR_IS_DYNAMIC(tx_itr_setting))) {
- goto enable_int;
- }
-
- if (ITR_IS_DYNAMIC(tx_itr_setting)) {
- rx = i40e_set_new_dynamic_itr(&q_vector->rx);
- rxval = i40e_buildreg_itr(I40E_RX_ITR, q_vector->rx.itr);
- }
-
- if (ITR_IS_DYNAMIC(tx_itr_setting)) {
- tx = i40e_set_new_dynamic_itr(&q_vector->tx);
- txval = i40e_buildreg_itr(I40E_TX_ITR, q_vector->tx.itr);
+ /* If we don't have MSIX, then we only need to re-enable icr0 */
+ if (!test_bit(I40E_FLAG_MSIX_ENA, vsi->back->flags)) {
+ i40e_irq_dynamic_enable_icr0(vsi->back);
+ return;
}
- if (rx || tx) {
- /* get the higher of the two ITR adjustments and
- * use the same value for both ITR registers
- * when in adaptive mode (Rx and/or Tx)
- */
- u16 itr = max(q_vector->tx.itr, q_vector->rx.itr);
-
- q_vector->tx.itr = q_vector->rx.itr = itr;
- txval = i40e_buildreg_itr(I40E_TX_ITR, itr);
- tx = true;
- rxval = i40e_buildreg_itr(I40E_RX_ITR, itr);
- rx = true;
- }
+ /* These will do nothing if dynamic updates are not enabled */
+ i40e_update_itr(q_vector, &q_vector->tx);
+ i40e_update_itr(q_vector, &q_vector->rx);
- /* only need to enable the interrupt once, but need
- * to possibly update both ITR values
+ /* This block of logic allows us to get away with only updating
+ * one ITR value with each interrupt. The idea is to perform a
+ * pseudo-lazy update with the following criteria.
+ *
+ * 1. Rx is given higher priority than Tx if both are in same state
+ * 2. If we must reduce an ITR that is given highest priority.
+ * 3. We then give priority to increasing ITR based on amount.
*/
- if (rx) {
- /* set the INTENA_MSK_MASK so that this first write
- * won't actually enable the interrupt, instead just
- * updating the ITR (it's bit 31 PF and VF)
+ if (q_vector->rx.target_itr < q_vector->rx.current_itr) {
+ /* Rx ITR needs to be reduced, this is highest priority */
+ itr_idx = I40E_RX_ITR;
+ interval = q_vector->rx.target_itr;
+ q_vector->rx.current_itr = q_vector->rx.target_itr;
+ q_vector->itr_countdown = ITR_COUNTDOWN_START;
+ } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) ||
+ ((q_vector->rx.target_itr - q_vector->rx.current_itr) <
+ (q_vector->tx.target_itr - q_vector->tx.current_itr))) {
+ /* Tx ITR needs to be reduced, this is second priority
+ * Tx ITR needs to be increased more than Rx, fourth priority
*/
- rxval |= BIT(31);
- /* don't check _DOWN because interrupt isn't being enabled */
- wr32(hw, INTREG(vector - 1), rxval);
+ itr_idx = I40E_TX_ITR;
+ interval = q_vector->tx.target_itr;
+ q_vector->tx.current_itr = q_vector->tx.target_itr;
+ q_vector->itr_countdown = ITR_COUNTDOWN_START;
+ } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) {
+ /* Rx ITR needs to be increased, third priority */
+ itr_idx = I40E_RX_ITR;
+ interval = q_vector->rx.target_itr;
+ q_vector->rx.current_itr = q_vector->rx.target_itr;
+ q_vector->itr_countdown = ITR_COUNTDOWN_START;
+ } else {
+ /* No ITR update, lowest priority */
+ if (q_vector->itr_countdown)
+ q_vector->itr_countdown--;
}
-enable_int:
- if (!test_bit(__I40E_VSI_DOWN, vsi->state))
- wr32(hw, INTREG(vector - 1), txval);
+ /* Do not update interrupt control register if VSI is down */
+ if (test_bit(__I40E_VSI_DOWN, vsi->state))
+ return;
- if (q_vector->itr_countdown)
- q_vector->itr_countdown--;
- else
- q_vector->itr_countdown = ITR_COUNTDOWN_START;
+ /* Update ITR interval if necessary and enforce software interrupt
+ * if we are exiting busy poll.
+ */
+ if (q_vector->in_busy_poll) {
+ itr_val = i40e_buildreg_itr(itr_idx, interval, true);
+ q_vector->in_busy_poll = false;
+ } else {
+ itr_val = i40e_buildreg_itr(itr_idx, interval, false);
+ }
+ wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), itr_val);
}
/**
@@ -2319,6 +2745,10 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
container_of(napi, struct i40e_q_vector, napi);
struct i40e_vsi *vsi = q_vector->vsi;
struct i40e_ring *ring;
+ bool tx_clean_complete = true;
+ bool rx_clean_complete = true;
+ unsigned int tx_cleaned = 0;
+ unsigned int rx_cleaned = 0;
bool clean_complete = true;
bool arm_wb = false;
int budget_per_ring;
@@ -2333,8 +2763,12 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
* budget and be more aggressive about cleaning up the Tx descriptors.
*/
i40e_for_each_ring(ring, q_vector->tx) {
- if (!i40e_clean_tx_irq(vsi, ring, budget)) {
- clean_complete = false;
+ bool wd = ring->xsk_pool ?
+ i40e_clean_xdp_tx_irq(vsi, ring) :
+ i40e_clean_tx_irq(vsi, ring, budget, &tx_cleaned);
+
+ if (!wd) {
+ clean_complete = tx_clean_complete = false;
continue;
}
arm_wb |= ring->arm_wb;
@@ -2345,23 +2779,34 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
if (budget <= 0)
goto tx_only;
- /* We attempt to distribute budget to each Rx queue fairly, but don't
- * allow the budget to go below 1 because that would exit polling early.
- */
- budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
+ /* normally we have 1 Rx ring per q_vector */
+ if (unlikely(q_vector->num_ringpairs > 1))
+ /* We attempt to distribute budget to each Rx queue fairly, but
+ * don't allow the budget to go below 1 because that would exit
+ * polling early.
+ */
+ budget_per_ring = max_t(int, budget / q_vector->num_ringpairs, 1);
+ else
+ /* Max of 1 Rx ring in this q_vector so give it the budget */
+ budget_per_ring = budget;
i40e_for_each_ring(ring, q_vector->rx) {
- int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);
+ int cleaned = ring->xsk_pool ?
+ i40e_clean_rx_irq_zc(ring, budget_per_ring) :
+ i40e_clean_rx_irq(ring, budget_per_ring, &rx_cleaned);
work_done += cleaned;
/* if we clean as many as budgeted, we must not be done */
if (cleaned >= budget_per_ring)
- clean_complete = false;
+ clean_complete = rx_clean_complete = false;
}
+ if (!i40e_enabled_xdp_vsi(vsi))
+ trace_i40e_napi_poll(napi, q_vector, budget, budget_per_ring, rx_cleaned,
+ tx_cleaned, rx_clean_complete, tx_clean_complete);
+
/* If work not completed, return budget and polling will return */
if (!clean_complete) {
- const cpumask_t *aff_mask = &q_vector->affinity_mask;
int cpu_id = smp_processor_id();
/* It is possible that the interrupt affinity has changed but,
@@ -2371,33 +2816,34 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
* continue to poll, otherwise we must stop polling so the
* interrupt can move to the correct cpu.
*/
- if (likely(cpumask_test_cpu(cpu_id, aff_mask) ||
- !(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))) {
+ if (!cpumask_test_cpu(cpu_id, &q_vector->affinity_mask)) {
+ /* Tell napi that we are done polling */
+ napi_complete_done(napi, work_done);
+
+ /* Force an interrupt */
+ i40e_force_wb(vsi, q_vector);
+
+ /* Return budget-1 so that polling stops */
+ return budget - 1;
+ }
tx_only:
- if (arm_wb) {
- q_vector->tx.ring[0].tx_stats.tx_force_wb++;
- i40e_enable_wb_on_itr(vsi, q_vector);
- }
- return budget;
+ if (arm_wb) {
+ q_vector->tx.ring[0].tx_stats.tx_force_wb++;
+ i40e_enable_wb_on_itr(vsi, q_vector);
}
+ return budget;
}
- if (vsi->back->flags & I40E_TXR_FLAGS_WB_ON_ITR)
+ if (q_vector->tx.ring[0].flags & I40E_TXR_FLAGS_WB_ON_ITR)
q_vector->arm_wb_state = false;
- /* Work is done so exit the polling mode and re-enable the interrupt */
- napi_complete_done(napi, work_done);
-
- /* If we're prematurely stopping polling to fix the interrupt
- * affinity we want to make sure polling starts back up so we
- * issue a call to i40e_force_wb which triggers a SW interrupt.
+ /* Exit the polling mode, but don't re-enable interrupts if stack might
+ * poll us due to busy-polling
*/
- if (!clean_complete)
- i40e_force_wb(vsi, q_vector);
- else if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))
- i40e_irq_dynamic_enable_icr0(vsi->back, false);
- else
+ if (likely(napi_complete_done(napi, work_done)))
i40e_update_enable_itr(vsi, q_vector);
+ else
+ q_vector->in_busy_poll = true;
return min(work_done, budget - 1);
}
@@ -2425,10 +2871,10 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
u16 i;
/* make sure ATR is enabled */
- if (!(pf->flags & I40E_FLAG_FD_ATR_ENABLED))
+ if (!test_bit(I40E_FLAG_FD_ATR_ENA, pf->flags))
return;
- if (pf->flags & I40E_FLAG_FD_ATR_AUTO_DISABLED)
+ if (test_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state))
return;
/* if sampling is disabled do nothing */
@@ -2451,9 +2897,15 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
hlen = (hdr.network[0] & 0x0F) << 2;
l4_proto = hdr.ipv4->protocol;
} else {
- hlen = hdr.network - skb->data;
- l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL);
- hlen -= hdr.network - skb->data;
+ /* find the start of the innermost ipv6 header */
+ unsigned int inner_hlen = hdr.network - skb->data;
+ unsigned int h_offset = inner_hlen;
+
+ /* this function updates h_offset to the end of the header */
+ l4_proto =
+ ipv6_find_hdr(skb, &h_offset, IPPROTO_TCP, NULL, NULL);
+ /* hlen will contain our best estimate of the tcp header */
+ hlen = h_offset - inner_hlen;
}
if (l4_proto != IPPROTO_TCP)
@@ -2462,9 +2914,9 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
th = (struct tcphdr *)(hdr.network + hlen);
/* Due to lack of space, no more new filters can be programmed */
- if (th->syn && (pf->flags & I40E_FLAG_FD_ATR_AUTO_DISABLED))
+ if (th->syn && test_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state))
return;
- if (pf->flags & I40E_FLAG_HW_ATR_EVICT_ENABLED) {
+ if (test_bit(I40E_FLAG_HW_ATR_EVICT_ENA, pf->flags)) {
/* HW ATR eviction will take care of removing filters on FIN
* and RST packets.
*/
@@ -2490,12 +2942,12 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
i++;
tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
- flex_ptype = (tx_ring->queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
- I40E_TXD_FLTR_QW0_QINDEX_MASK;
+ flex_ptype = FIELD_PREP(I40E_TXD_FLTR_QW0_QINDEX_MASK,
+ tx_ring->queue_index);
flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ?
- (I40E_FILTER_PCTYPE_NONF_IPV4_TCP <<
+ (LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP <<
I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) :
- (I40E_FILTER_PCTYPE_NONF_IPV6_TCP <<
+ (LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP <<
I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;
@@ -2517,16 +2969,14 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
if (!(tx_flags & I40E_TX_FLAGS_UDP_TUNNEL))
dtype_cmd |=
- ((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) <<
- I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
- I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
+ FIELD_PREP(I40E_TXD_FLTR_QW1_CNTINDEX_MASK,
+ I40E_FD_ATR_STAT_IDX(pf->hw.pf_id));
else
dtype_cmd |=
- ((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) <<
- I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
- I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
+ FIELD_PREP(I40E_TXD_FLTR_QW1_CNTINDEX_MASK,
+ I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id));
- if (pf->flags & I40E_FLAG_HW_ATR_EVICT_ENABLED)
+ if (test_bit(I40E_FLAG_HW_ATR_EVICT_ENA, pf->flags))
dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK;
fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
@@ -2584,7 +3034,7 @@ static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
tx_flags |= I40E_TX_FLAGS_SW_VLAN;
}
- if (!(tx_ring->vsi->back->flags & I40E_FLAG_DCB_ENABLED))
+ if (!test_bit(I40E_FLAG_DCB_ENA, tx_ring->vsi->back->flags))
goto out;
/* Insert 802.1p priority into VLAN header */
@@ -2600,7 +3050,7 @@ static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
rc = skb_cow_head(skb, 0);
if (rc < 0)
return rc;
- vhdr = (struct vlan_ethhdr *)skb->data;
+ vhdr = skb_vlan_eth_hdr(skb);
vhdr->h_vlan_TCI = htons(tx_flags >>
I40E_TX_FLAGS_VLAN_SHIFT);
} else {
@@ -2626,6 +3076,7 @@ static int i40e_tso(struct i40e_tx_buffer *first, u8 *hdr_len,
{
struct sk_buff *skb = first->skb;
u64 cd_cmd, cd_tso_len, cd_mss;
+ __be16 protocol;
union {
struct iphdr *v4;
struct ipv6hdr *v6;
@@ -2637,7 +3088,7 @@ static int i40e_tso(struct i40e_tx_buffer *first, u8 *hdr_len,
unsigned char *hdr;
} l4;
u32 paylen, l4_offset;
- u16 gso_segs, gso_size;
+ u16 gso_size;
int err;
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -2650,15 +3101,23 @@ static int i40e_tso(struct i40e_tx_buffer *first, u8 *hdr_len,
if (err < 0)
return err;
- ip.hdr = skb_network_header(skb);
- l4.hdr = skb_transport_header(skb);
+ protocol = vlan_get_protocol(skb);
+
+ if (eth_p_mpls(protocol))
+ ip.hdr = skb_inner_network_header(skb);
+ else
+ ip.hdr = skb_network_header(skb);
+ l4.hdr = skb_checksum_start(skb);
/* initialize outer IP header fields */
if (ip.v4->version == 4) {
ip.v4->tot_len = 0;
ip.v4->check = 0;
+
+ first->tx_flags |= I40E_TX_FLAGS_TSO;
} else {
ip.v6->payload_len = 0;
+ first->tx_flags |= I40E_TX_FLAGS_TSO;
}
if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
@@ -2698,17 +3157,22 @@ static int i40e_tso(struct i40e_tx_buffer *first, u8 *hdr_len,
/* remove payload length from inner checksum */
paylen = skb->len - l4_offset;
- csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen));
- /* compute length of segmentation header */
- *hdr_len = (l4.tcp->doff * 4) + l4_offset;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
+ csum_replace_by_diff(&l4.udp->check, (__force __wsum)htonl(paylen));
+ /* compute length of segmentation header */
+ *hdr_len = sizeof(*l4.udp) + l4_offset;
+ } else {
+ csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen));
+ /* compute length of segmentation header */
+ *hdr_len = (l4.tcp->doff * 4) + l4_offset;
+ }
/* pull values out of skb_shinfo */
gso_size = skb_shinfo(skb)->gso_size;
- gso_segs = skb_shinfo(skb)->gso_segs;
/* update GSO size and bytecount with header size */
- first->gso_segs = gso_segs;
+ first->gso_segs = skb_shinfo(skb)->gso_segs;
first->bytecount += (first->gso_segs - 1) * *hdr_len;
/* find the field values */
@@ -2746,7 +3210,7 @@ static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb,
* we are not already transmitting a packet to be timestamped
*/
pf = i40e_netdev_to_pf(tx_ring->netdev);
- if (!(pf->flags & I40E_FLAG_PTP))
+ if (!test_bit(I40E_FLAG_PTP_ENA, pf->flags))
return 0;
if (pf->ptp_tx &&
@@ -2792,13 +3256,29 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
unsigned char *exthdr;
u32 offset, cmd = 0;
__be16 frag_off;
+ __be16 protocol;
u8 l4_proto = 0;
if (skb->ip_summed != CHECKSUM_PARTIAL)
return 0;
- ip.hdr = skb_network_header(skb);
- l4.hdr = skb_transport_header(skb);
+ protocol = vlan_get_protocol(skb);
+
+ if (eth_p_mpls(protocol)) {
+ ip.hdr = skb_inner_network_header(skb);
+ l4.hdr = skb_checksum_start(skb);
+ } else {
+ ip.hdr = skb_network_header(skb);
+ l4.hdr = skb_transport_header(skb);
+ }
+
+ /* set the tx_flags to indicate the IP protocol type. this is
+ * required so that checksum header computation below is accurate.
+ */
+ if (ip.v4->version == 4)
+ *tx_flags |= I40E_TX_FLAGS_IPV4;
+ else
+ *tx_flags |= I40E_TX_FLAGS_IPV6;
/* compute outer L2 header size */
offset = ((ip.hdr - skb->data) / 2) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
@@ -2813,13 +3293,16 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
l4_proto = ip.v4->protocol;
} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
+ int ret;
+
tunnel |= I40E_TX_CTX_EXT_IP_IPV6;
exthdr = ip.hdr + sizeof(*ip.v6);
l4_proto = ip.v6->nexthdr;
- if (l4.hdr != exthdr)
- ipv6_skip_exthdr(skb, exthdr - skb->data,
- &l4_proto, &frag_off);
+ ret = ipv6_skip_exthdr(skb, exthdr - skb->data,
+ &l4_proto, &frag_off);
+ if (ret < 0)
+ return -1;
}
/* define outer transport */
@@ -2932,7 +3415,7 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
}
/**
- * i40e_create_tx_ctx Build the Tx context descriptor
+ * i40e_create_tx_ctx - Build the Tx context descriptor
* @tx_ring: ring to create the descriptor on
* @cd_type_cmd_tso_mss: Quad Word 1
* @cd_tunneling: Quad Word 0 - bits 0-31
@@ -2975,6 +3458,8 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
/* Memory barrier before checking head and tail */
smp_mb();
+ ++tx_ring->tx_stats.tx_stopped;
+
/* Check again in a case another CPU has just made room available. */
if (likely(I40E_DESC_UNUSED(tx_ring) < size))
return -EBUSY;
@@ -3000,7 +3485,7 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
**/
bool __i40e_chk_linearize(struct sk_buff *skb)
{
- const struct skb_frag_struct *frag, *stale;
+ const skb_frag_t *frag, *stale;
int nr_frags, sum;
/* no need to check if number of frags is less than 7 */
@@ -3032,10 +3517,30 @@ bool __i40e_chk_linearize(struct sk_buff *skb)
/* Walk through fragments adding latest fragment, testing it, and
* then removing stale fragments from the sum.
*/
- stale = &skb_shinfo(skb)->frags[0];
- for (;;) {
+ for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
+ int stale_size = skb_frag_size(stale);
+
sum += skb_frag_size(frag++);
+ /* The stale fragment may present us with a smaller
+ * descriptor than the actual fragment size. To account
+ * for that we need to remove all the data on the front and
+ * figure out what the remainder would be in the last
+ * descriptor associated with the fragment.
+ */
+ if (stale_size > I40E_MAX_DATA_PER_TXD) {
+ int align_pad = -(skb_frag_off(stale)) &
+ (I40E_MAX_READ_REQ_SIZE - 1);
+
+ sum -= align_pad;
+ stale_size -= align_pad;
+
+ do {
+ sum -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+ stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED;
+ } while (stale_size > I40E_MAX_DATA_PER_TXD);
+ }
+
/* if sum is negative we failed to make sufficient progress */
if (sum < 0)
return true;
@@ -3043,7 +3548,7 @@ bool __i40e_chk_linearize(struct sk_buff *skb)
if (!nr_frags--)
break;
- sum -= skb_frag_size(stale++);
+ sum -= stale_size;
}
return false;
@@ -3067,7 +3572,7 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
{
unsigned int data_len = skb->data_len;
unsigned int size = skb_headlen(skb);
- struct skb_frag_struct *frag;
+ skb_frag_t *frag;
struct i40e_tx_buffer *tx_bi;
struct i40e_tx_desc *tx_desc;
u16 i = tx_ring->next_to_use;
@@ -3077,8 +3582,7 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
if (tx_flags & I40E_TX_FLAGS_HW_VLAN) {
td_cmd |= I40E_TX_DESC_CMD_IL2TAG1;
- td_tag = (tx_flags & I40E_TX_FLAGS_VLAN_MASK) >>
- I40E_TX_FLAGS_VLAN_SHIFT;
+ td_tag = FIELD_GET(I40E_TX_FLAGS_VLAN_MASK, tx_flags);
}
first->tx_flags = tx_flags;
@@ -3160,38 +3664,12 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
/* write last descriptor with EOP bit */
td_cmd |= I40E_TX_DESC_CMD_EOP;
- /* We can OR these values together as they both are checked against
- * 4 below and at this point desc_count will be used as a boolean value
- * after this if/else block.
+ /* We OR these values together to check both against 4 (WB_STRIDE)
+ * below. This is safe since we don't re-use desc_count afterwards.
*/
desc_count |= ++tx_ring->packet_stride;
- /* Algorithm to optimize tail and RS bit setting:
- * if queue is stopped
- * mark RS bit
- * reset packet counter
- * else if xmit_more is supported and is true
- * advance packet counter to 4
- * reset desc_count to 0
- *
- * if desc_count >= 4
- * mark RS bit
- * reset packet counter
- * if desc_count > 0
- * update tail
- *
- * Note: If there are less than 4 descriptors
- * pending and interrupts were disabled the service task will
- * trigger a force WB.
- */
- if (netif_xmit_stopped(txring_txq(tx_ring))) {
- goto do_rs;
- } else if (skb->xmit_more) {
- /* set stride to arm on next packet and reset desc_count */
- tx_ring->packet_stride = WB_STRIDE;
- desc_count = 0;
- } else if (desc_count >= WB_STRIDE) {
-do_rs:
+ if (desc_count >= WB_STRIDE) {
/* write last descriptor with RS bit set */
td_cmd |= I40E_TX_DESC_CMD_RS;
tx_ring->packet_stride = 0;
@@ -3200,6 +3678,8 @@ do_rs:
tx_desc->cmd_type_offset_bsz =
build_ctob(td_cmd, td_offset, size, td_tag);
+ skb_tx_timestamp(skb);
+
/* Force memory writes to complete before letting h/w know there
* are new descriptors to fetch.
*
@@ -3212,13 +3692,8 @@ do_rs:
first->next_to_watch = tx_desc;
/* notify HW of packet */
- if (desc_count) {
+ if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) {
writel(i, tx_ring->tail);
-
- /* we need this if more than one processor can write to our tail
- * at a time, it synchronizes IO on IA64/Altix systems
- */
- mmiowb();
}
return 0;
@@ -3242,57 +3717,144 @@ dma_error:
return -1;
}
+static u16 i40e_swdcb_skb_tx_hash(struct net_device *dev,
+ const struct sk_buff *skb,
+ u16 num_tx_queues)
+{
+ u32 jhash_initval_salt = 0xd631614b;
+ u32 hash;
+
+ if (skb->sk && skb->sk->sk_hash)
+ hash = skb->sk->sk_hash;
+ else
+ hash = (__force u16)skb->protocol ^ skb->hash;
+
+ hash = jhash_1word(hash, jhash_initval_salt);
+
+ return (u16)(((u64)hash * num_tx_queues) >> 32);
+}
+
+u16 i40e_lan_select_queue(struct net_device *netdev,
+ struct sk_buff *skb,
+ struct net_device __always_unused *sb_dev)
+{
+ struct i40e_netdev_priv *np = netdev_priv(netdev);
+ struct i40e_vsi *vsi = np->vsi;
+ struct i40e_hw *hw;
+ u16 qoffset;
+ u16 qcount;
+ u8 tclass;
+ u16 hash;
+ u8 prio;
+
+ /* is DCB enabled at all? */
+ if (vsi->tc_config.numtc == 1 ||
+ i40e_is_tc_mqprio_enabled(vsi->back))
+ return netdev_pick_tx(netdev, skb, sb_dev);
+
+ prio = skb->priority;
+ hw = &vsi->back->hw;
+ tclass = hw->local_dcbx_config.etscfg.prioritytable[prio];
+ /* sanity check */
+ if (unlikely(!(vsi->tc_config.enabled_tc & BIT(tclass))))
+ tclass = 0;
+
+ /* select a queue assigned for the given TC */
+ qcount = vsi->tc_config.tc_info[tclass].qcount;
+ hash = i40e_swdcb_skb_tx_hash(netdev, skb, qcount);
+
+ qoffset = vsi->tc_config.tc_info[tclass].qoffset;
+ return qoffset + hash;
+}
+
/**
* i40e_xmit_xdp_ring - transmits an XDP buffer to an XDP Tx ring
- * @xdp: data to transmit
+ * @xdpf: data to transmit
* @xdp_ring: XDP Tx ring
**/
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
struct i40e_ring *xdp_ring)
{
- u32 size = xdp->data_end - xdp->data;
- u16 i = xdp_ring->next_to_use;
- struct i40e_tx_buffer *tx_bi;
- struct i40e_tx_desc *tx_desc;
- dma_addr_t dma;
-
- if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
+ u8 nr_frags = unlikely(xdp_frame_has_frags(xdpf)) ? sinfo->nr_frags : 0;
+ u16 i = 0, index = xdp_ring->next_to_use;
+ struct i40e_tx_buffer *tx_head = &xdp_ring->tx_bi[index];
+ struct i40e_tx_buffer *tx_bi = tx_head;
+ struct i40e_tx_desc *tx_desc = I40E_TX_DESC(xdp_ring, index);
+ void *data = xdpf->data;
+ u32 size = xdpf->len;
+
+ if (unlikely(I40E_DESC_UNUSED(xdp_ring) < 1 + nr_frags)) {
xdp_ring->tx_stats.tx_busy++;
return I40E_XDP_CONSUMED;
}
- dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE);
- if (dma_mapping_error(xdp_ring->dev, dma))
- return I40E_XDP_CONSUMED;
+ tx_head->bytecount = xdp_get_frame_len(xdpf);
+ tx_head->gso_segs = 1;
+ tx_head->xdpf = xdpf;
- tx_bi = &xdp_ring->tx_bi[i];
- tx_bi->bytecount = size;
- tx_bi->gso_segs = 1;
- tx_bi->raw_buf = xdp->data;
+ for (;;) {
+ dma_addr_t dma;
- /* record length, and DMA address */
- dma_unmap_len_set(tx_bi, len, size);
- dma_unmap_addr_set(tx_bi, dma, dma);
+ dma = dma_map_single(xdp_ring->dev, data, size, DMA_TO_DEVICE);
+ if (dma_mapping_error(xdp_ring->dev, dma))
+ goto unmap;
- tx_desc = I40E_TX_DESC(xdp_ring, i);
- tx_desc->buffer_addr = cpu_to_le64(dma);
- tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC
- | I40E_TXD_CMD,
- 0, size, 0);
+ /* record length, and DMA address */
+ dma_unmap_len_set(tx_bi, len, size);
+ dma_unmap_addr_set(tx_bi, dma, dma);
+
+ tx_desc->buffer_addr = cpu_to_le64(dma);
+ tx_desc->cmd_type_offset_bsz =
+ build_ctob(I40E_TX_DESC_CMD_ICRC, 0, size, 0);
+
+ if (++index == xdp_ring->count)
+ index = 0;
+
+ if (i == nr_frags)
+ break;
+
+ tx_bi = &xdp_ring->tx_bi[index];
+ tx_desc = I40E_TX_DESC(xdp_ring, index);
+
+ data = skb_frag_address(&sinfo->frags[i]);
+ size = skb_frag_size(&sinfo->frags[i]);
+ i++;
+ }
+
+ tx_desc->cmd_type_offset_bsz |=
+ cpu_to_le64(I40E_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT);
/* Make certain all of the status bits have been updated
* before next_to_watch is written.
*/
smp_wmb();
- i++;
- if (i == xdp_ring->count)
- i = 0;
+ xdp_ring->xdp_tx_active++;
- tx_bi->next_to_watch = tx_desc;
- xdp_ring->next_to_use = i;
+ tx_head->next_to_watch = tx_desc;
+ xdp_ring->next_to_use = index;
return I40E_XDP_TX;
+
+unmap:
+ for (;;) {
+ tx_bi = &xdp_ring->tx_bi[index];
+ if (dma_unmap_len(tx_bi, len))
+ dma_unmap_page(xdp_ring->dev,
+ dma_unmap_addr(tx_bi, dma),
+ dma_unmap_len(tx_bi, len),
+ DMA_TO_DEVICE);
+ dma_unmap_len_set(tx_bi, len, 0);
+ if (tx_bi == tx_head)
+ break;
+
+ if (!index)
+ index += xdp_ring->count;
+ index--;
+ }
+
+ return I40E_XDP_CONSUMED;
}
/**
@@ -3310,7 +3872,6 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
struct i40e_tx_buffer *first;
u32 td_offset = 0;
u32 tx_flags = 0;
- __be16 protocol;
u32 td_cmd = 0;
u8 hdr_len = 0;
int tso, count;
@@ -3352,15 +3913,6 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
goto out_drop;
- /* obtain protocol of skb */
- protocol = vlan_get_protocol(skb);
-
- /* setup IPv4/IPv6 offloads */
- if (protocol == htons(ETH_P_IP))
- tx_flags |= I40E_TX_FLAGS_IPV4;
- else if (protocol == htons(ETH_P_IPV6))
- tx_flags |= I40E_TX_FLAGS_IPV6;
-
tso = i40e_tso(first, &hdr_len, &cd_type_cmd_tso_mss);
if (tso < 0)
@@ -3379,8 +3931,6 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
if (tsyn)
tx_flags |= I40E_TX_FLAGS_TSYN;
- skb_tx_timestamp(skb);
-
/* always enable CRC insertion offload */
td_cmd |= I40E_TX_DESC_CMD_ICRC;
@@ -3436,3 +3986,55 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
return i40e_xmit_frame_ring(skb, tx_ring);
}
+
+/**
+ * i40e_xdp_xmit - Implements ndo_xdp_xmit
+ * @dev: netdev
+ * @n: number of frames
+ * @frames: array of XDP buffer pointers
+ * @flags: XDP extra info
+ *
+ * Returns number of frames successfully sent. Failed frames
+ * will be free'ed by XDP core.
+ *
+ * For error cases, a negative errno code is returned and no-frames
+ * are transmitted (caller must handle freeing frames).
+ **/
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+ u32 flags)
+{
+ struct i40e_netdev_priv *np = netdev_priv(dev);
+ unsigned int queue_index = smp_processor_id();
+ struct i40e_vsi *vsi = np->vsi;
+ struct i40e_pf *pf = vsi->back;
+ struct i40e_ring *xdp_ring;
+ int nxmit = 0;
+ int i;
+
+ if (test_bit(__I40E_VSI_DOWN, vsi->state))
+ return -ENETDOWN;
+
+ if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs ||
+ test_bit(__I40E_CONFIG_BUSY, pf->state))
+ return -ENXIO;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ xdp_ring = vsi->xdp_rings[queue_index];
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ int err;
+
+ err = i40e_xmit_xdp_ring(xdpf, xdp_ring);
+ if (err != I40E_XDP_TX)
+ break;
+ nxmit++;
+ }
+
+ if (unlikely(flags & XDP_XMIT_FLUSH))
+ i40e_xdp_ring_update_tail(xdp_ring);
+
+ return nxmit;
+}