96 files changed, 1930 insertions, 570 deletions
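The scaling.rst hunk that opens the diff below rewords the XPS documentation: XPS may now be set up by the driver at device init, and the CPU/receive-queue to transmit-queue mapping is inspected and configured through sysfs. As a minimal userspace sketch of that sysfs interface — the interface name eth0, queue index tx-0, and mask value are assumptions for illustration; the path pattern is the one scaling.rst documents:

#include <stdio.h>

int main(void)
{
	/* xps_cpus holds the bitmap of CPUs that may use this TX queue.
	 * eth0/tx-0 are illustrative; adjust to the device under test. */
	const char *path = "/sys/class/net/eth0/queues/tx-0/xps_cpus";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "3\n");	/* hex mask 0x3: CPUs 0 and 1 */
	fclose(f);
	return 0;
}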
diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 8f0347b9fb3d..3d435caa3ef2 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -465,9 +465,9 @@ XPS Configuration ----------------- XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by -default for SMP). The functionality remains disabled until explicitly -configured. To enable XPS, the bitmap of CPUs/receive-queues that may -use a transmit queue is configured using the sysfs file entry: +default for SMP). If compiled in, it is driver dependent whether, and +how, XPS is configured at device init. The mapping of CPUs/receive-queues +to transmit queue can be inspected and configured using sysfs: For selection based on CPUs map:: diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h index bb092b6b36b2..b02f981e7c32 100644 --- a/drivers/crypto/chelsio/chcr_core.h +++ b/drivers/crypto/chelsio/chcr_core.h @@ -137,6 +137,4 @@ int chcr_uld_rx_handler(void *handle, const __be64 *rsp, int chcr_uld_tx_handler(struct sk_buff *skb, struct net_device *dev); int chcr_handle_resp(struct crypto_async_request *req, unsigned char *input, int err); -int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev); -void chcr_add_xfrmops(const struct cxgb4_lld_info *lld); #endif /* __CHCR_CORE_H__ */ diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 014351ebbefa..9f71b9d706bd 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -97,41 +97,9 @@ static void hfi1_ipoib_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); - u64 rx_packets = 0ull; - u64 rx_bytes = 0ull; - u64 tx_packets = 0ull; - u64 tx_bytes = 0ull; - int i; netdev_stats_to_stats64(storage, &dev->stats); - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *stats; - unsigned int start; - u64 trx_packets; - u64 trx_bytes; - u64 ttx_packets; - u64 ttx_bytes; - - stats = per_cpu_ptr(priv->netstats, i); - do { - start = u64_stats_fetch_begin_irq(&stats->syncp); - trx_packets = stats->rx_packets; - trx_bytes = stats->rx_bytes; - ttx_packets = stats->tx_packets; - ttx_bytes = stats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); - - rx_packets += trx_packets; - rx_bytes += trx_bytes; - tx_packets += ttx_packets; - tx_bytes += ttx_bytes; - } - - storage->rx_packets += rx_packets; - storage->rx_bytes += rx_bytes; - storage->tx_packets += tx_packets; - storage->tx_bytes += tx_bytes; + dev_fetch_sw_netstats(storage, priv->netstats); } static const struct net_device_ops hfi1_ipoib_netdev_ops = { diff --git a/drivers/net/can/m_can/m_can_platform.c b/drivers/net/can/m_can/m_can_platform.c index 38ea5e600fb8..e6d0cb9ee02f 100644 --- a/drivers/net/can/m_can/m_can_platform.c +++ b/drivers/net/can/m_can/m_can_platform.c @@ -144,8 +144,6 @@ static int __maybe_unused m_can_runtime_suspend(struct device *dev) struct net_device *ndev = dev_get_drvdata(dev); struct m_can_classdev *mcan_class = netdev_priv(ndev); - m_can_class_suspend(dev); - clk_disable_unprepare(mcan_class->cclk); clk_disable_unprepare(mcan_class->hclk); diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 01f5784f69cb..0ef854911f21 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -103,14 +103,8 @@ void 
ksz_init_mib_timer(struct ksz_device *dev) INIT_DELAYED_WORK(&dev->mib_read, ksz_mib_read_work); - /* Read MIB counters every 30 seconds to avoid overflow. */ - dev->mib_read_interval = msecs_to_jiffies(30000); - for (i = 0; i < dev->mib_port_cnt; i++) dev->dev_ops->port_init_cnt(dev, i); - - /* Start the timer 2 seconds later. */ - schedule_delayed_work(&dev->mib_read, msecs_to_jiffies(2000)); } EXPORT_SYMBOL_GPL(ksz_init_mib_timer); @@ -143,7 +137,9 @@ void ksz_mac_link_down(struct dsa_switch *ds, int port, unsigned int mode, /* Read all MIB counters when the link is going down. */ p->read = true; - schedule_delayed_work(&dev->mib_read, 0); + /* Only schedule a read if the MIB timer has been started. */ + if (dev->mib_read_interval) + schedule_delayed_work(&dev->mib_read, 0); } EXPORT_SYMBOL_GPL(ksz_mac_link_down); @@ -451,6 +447,12 @@ int ksz_switch_register(struct ksz_device *dev, return ret; } + /* Read MIB counters every 30 seconds to avoid overflow. */ + dev->mib_read_interval = msecs_to_jiffies(30000); + + /* Start the MIB timer. */ + schedule_delayed_work(&dev->mib_read, 0); + return 0; } EXPORT_SYMBOL(ksz_switch_register); diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 3fd5c6cc23af..5de47f6fde5a 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -366,6 +366,8 @@ #define MACB_ISR_RLE_SIZE 1 #define MACB_TXERR_OFFSET 6 /* EN TX frame corrupt from error interrupt */ #define MACB_TXERR_SIZE 1 +#define MACB_RM9200_TBRE_OFFSET 6 /* EN may send new frame interrupt (RM9200) */ +#define MACB_RM9200_TBRE_SIZE 1 #define MACB_TCOMP_OFFSET 7 /* Enable transmit complete interrupt */ #define MACB_TCOMP_SIZE 1 #define MACB_ISR_LINK_OFFSET 9 /* Enable link change interrupt */ @@ -1205,10 +1207,10 @@ struct macb { phy_interface_t phy_interface; - /* AT91RM9200 transmit */ - struct sk_buff *skb; /* holds skb until xmit interrupt completes */ - dma_addr_t skb_physaddr; /* phys addr from pci_map_single */ - int skb_length; /* saved skb length for pci_unmap_single */ + /* AT91RM9200 transmit queue (1 on wire + 1 queued) */ + struct macb_tx_skb rm9200_txq[2]; + unsigned int rm9200_tx_tail; + unsigned int rm9200_tx_len; unsigned int max_tx_length; u64 ethtool_stats[GEM_STATS_LEN + QUEUE_STATS_LEN * MACB_MAX_QUEUES]; diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 4b42b2d6398c..883e47c5b1a7 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3908,6 +3908,7 @@ static int at91ether_start(struct macb *lp) MACB_BIT(ISR_TUND) | MACB_BIT(ISR_RLE) | MACB_BIT(TCOMP) | + MACB_BIT(RM9200_TBRE) | MACB_BIT(ISR_ROVR) | MACB_BIT(HRESP)); @@ -3924,6 +3925,7 @@ static void at91ether_stop(struct macb *lp) MACB_BIT(ISR_TUND) | MACB_BIT(ISR_RLE) | MACB_BIT(TCOMP) | + MACB_BIT(RM9200_TBRE) | MACB_BIT(ISR_ROVR) | MACB_BIT(HRESP)); @@ -3993,24 +3995,34 @@ static netdev_tx_t at91ether_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macb *lp = netdev_priv(dev); + unsigned long flags; - if (macb_readl(lp, TSR) & MACB_BIT(RM9200_BNQ)) { - netif_stop_queue(dev); + if (lp->rm9200_tx_len < 2) { + int desc = lp->rm9200_tx_tail; /* Store packet information (to free when Tx completed) */ - lp->skb = skb; - lp->skb_length = skb->len; - lp->skb_physaddr = dma_map_single(&lp->pdev->dev, skb->data, - skb->len, DMA_TO_DEVICE); - if (dma_mapping_error(&lp->pdev->dev, lp->skb_physaddr)) { + lp->rm9200_txq[desc].skb = skb; + lp->rm9200_txq[desc].size = skb->len; +
lp->rm9200_txq[desc].mapping = dma_map_single(&lp->pdev->dev, skb->data, + skb->len, DMA_TO_DEVICE); + if (dma_mapping_error(&lp->pdev->dev, lp->rm9200_txq[desc].mapping)) { dev_kfree_skb_any(skb); dev->stats.tx_dropped++; netdev_err(dev, "%s: DMA mapping error\n", __func__); return NETDEV_TX_OK; } + spin_lock_irqsave(&lp->lock, flags); + + lp->rm9200_tx_tail = (desc + 1) & 1; + lp->rm9200_tx_len++; + if (lp->rm9200_tx_len > 1) + netif_stop_queue(dev); + + spin_unlock_irqrestore(&lp->lock, flags); + /* Set address of the data in the Transmit Address register */ - macb_writel(lp, TAR, lp->skb_physaddr); + macb_writel(lp, TAR, lp->rm9200_txq[desc].mapping); /* Set length of the packet in the Transmit Control register */ macb_writel(lp, TCR, skb->len); @@ -4073,6 +4085,9 @@ static irqreturn_t at91ether_interrupt(int irq, void *dev_id) struct net_device *dev = dev_id; struct macb *lp = netdev_priv(dev); u32 intstatus, ctl; + unsigned int desc; + unsigned int qlen; + u32 tsr; /* MAC Interrupt Status register indicates what interrupts are pending. * It is automatically cleared once read. @@ -4084,20 +4099,39 @@ static irqreturn_t at91ether_interrupt(int irq, void *dev_id) at91ether_rx(dev); /* Transmit complete */ - if (intstatus & MACB_BIT(TCOMP)) { + if (intstatus & (MACB_BIT(TCOMP) | MACB_BIT(RM9200_TBRE))) { /* The TCOM bit is set even if the transmission failed */ if (intstatus & (MACB_BIT(ISR_TUND) | MACB_BIT(ISR_RLE))) dev->stats.tx_errors++; - if (lp->skb) { - dev_consume_skb_irq(lp->skb); - lp->skb = NULL; - dma_unmap_single(&lp->pdev->dev, lp->skb_physaddr, - lp->skb_length, DMA_TO_DEVICE); + spin_lock(&lp->lock); + + tsr = macb_readl(lp, TSR); + + /* we have three possibilities here: + * - all pending packets transmitted (TGO, implies BNQ) + * - only first packet transmitted (!TGO && BNQ) + * - two frames pending (!TGO && !BNQ) + * Note that TGO ("transmit go") is called "IDLE" on RM9200. + */ + qlen = (tsr & MACB_BIT(TGO)) ? 0 : + (tsr & MACB_BIT(RM9200_BNQ)) ? 
1 : 2; + + while (lp->rm9200_tx_len > qlen) { + desc = (lp->rm9200_tx_tail - lp->rm9200_tx_len) & 1; + dev_consume_skb_irq(lp->rm9200_txq[desc].skb); + lp->rm9200_txq[desc].skb = NULL; + dma_unmap_single(&lp->pdev->dev, lp->rm9200_txq[desc].mapping, + lp->rm9200_txq[desc].size, DMA_TO_DEVICE); dev->stats.tx_packets++; - dev->stats.tx_bytes += lp->skb_length; + dev->stats.tx_bytes += lp->rm9200_txq[desc].size; + lp->rm9200_tx_len--; } - netif_wake_queue(dev); + + if (lp->rm9200_tx_len < 2 && netif_queue_stopped(dev)) + netif_wake_queue(dev); + + spin_unlock(&lp->lock); } /* Work-around for EMAC Errata section 41.3.1 */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index f642c1b475c4..1b88bd1c2dbe 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -60,6 +60,89 @@ static struct ch_tc_pedit_fields pedits[] = { PEDIT_FIELDS(IP6_, DST_127_96, 4, nat_lip, 12), }; +static const struct cxgb4_natmode_config cxgb4_natmode_config_array[] = { + /* Default supported NAT modes */ + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_NONE, + .natmode = NAT_MODE_NONE, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP, + .natmode = NAT_MODE_DIP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT, + .natmode = NAT_MODE_DIP_DP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_DIP_DP_SIP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_DP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_SIP | CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_SIP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_SIP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP | + CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_ALL, + }, + /* T6+ can ignore L4 ports when they're disabled. */ + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_SIP_SP, + }, + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_DP_SP, + }, + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_ALL, + }, +}; + +static void cxgb4_action_natmode_tweak(struct ch_filter_specification *fs, + u8 natmode_flags) +{ + u8 i = 0; + + /* Translate the enabled NAT 4-tuple fields to one of the + * hardware supported NAT mode configurations. This ensures + * that we pick a valid combination, where the disabled fields + * do not get overwritten to 0. 
+ */ + for (i = 0; i < ARRAY_SIZE(cxgb4_natmode_config_array); i++) { + if (cxgb4_natmode_config_array[i].flags == natmode_flags) { + fs->nat_mode = cxgb4_natmode_config_array[i].natmode; + return; + } + } +} + static struct ch_tc_flower_entry *allocate_flower_entry(void) { struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL); @@ -289,7 +372,8 @@ static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask, } static void process_pedit_field(struct ch_filter_specification *fs, u32 val, - u32 mask, u32 offset, u8 htype) + u32 mask, u32 offset, u8 htype, + u8 *natmode_flags) { switch (htype) { case FLOW_ACT_MANGLE_HDR_TYPE_ETH: @@ -314,60 +398,94 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val, switch (offset) { case PEDIT_IP4_SRC: offload_pedit(fs, val, mask, IP4_SRC); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP4_DST: offload_pedit(fs, val, mask, IP4_DST); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_IP6: switch (offset) { case PEDIT_IP6_SRC_31_0: offload_pedit(fs, val, mask, IP6_SRC_31_0); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_63_32: offload_pedit(fs, val, mask, IP6_SRC_63_32); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_95_64: offload_pedit(fs, val, mask, IP6_SRC_95_64); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_127_96: offload_pedit(fs, val, mask, IP6_SRC_127_96); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_DST_31_0: offload_pedit(fs, val, mask, IP6_DST_31_0); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_63_32: offload_pedit(fs, val, mask, IP6_DST_63_32); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_95_64: offload_pedit(fs, val, mask, IP6_DST_95_64); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_127_96: offload_pedit(fs, val, mask, IP6_DST_127_96); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_TCP: switch (offset) { case PEDIT_TCP_SPORT_DPORT: - if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) { fs->nat_fport = val; - else + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + } else { fs->nat_lport = val >> 16; + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; + } } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_UDP: switch (offset) { case PEDIT_UDP_SPORT_DPORT: - if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) { fs->nat_fport = val; - else + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + } else { fs->nat_lport = val >> 16; + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; + } } - fs->nat_mode = NAT_MODE_ALL; + break; + } +} + +static int cxgb4_action_natmode_validate(struct adapter *adap, u8 natmode_flags, + struct netlink_ext_ack *extack) +{ + u8 i = 0; + + /* Extract the NAT mode to enable based on what 4-tuple fields + * are enabled to be overwritten. This ensures that the + * disabled fields don't get overwritten to 0. 
+ */ + for (i = 0; i < ARRAY_SIZE(cxgb4_natmode_config_array); i++) { + const struct cxgb4_natmode_config *c; + + c = &cxgb4_natmode_config_array[i]; + if (CHELSIO_CHIP_VERSION(adap->params.chip) >= c->chip && + natmode_flags == c->flags) + return 0; } + NL_SET_ERR_MSG_MOD(extack, "Unsupported NAT mode 4-tuple combination"); + return -EOPNOTSUPP; } void cxgb4_process_flow_actions(struct net_device *in, @@ -375,6 +493,7 @@ void cxgb4_process_flow_actions(struct net_device *in, struct ch_filter_specification *fs) { struct flow_action_entry *act; + u8 natmode_flags = 0; int i; flow_action_for_each(i, act, actions) { @@ -426,7 +545,8 @@ void cxgb4_process_flow_actions(struct net_device *in, val = act->mangle.val; offset = act->mangle.offset; - process_pedit_field(fs, val, mask, offset, htype); + process_pedit_field(fs, val, mask, offset, htype, + &natmode_flags); } break; case FLOW_ACTION_QUEUE: @@ -438,6 +558,9 @@ void cxgb4_process_flow_actions(struct net_device *in, break; } } + if (natmode_flags) + cxgb4_action_natmode_tweak(fs, natmode_flags); + } static bool valid_l4_mask(u32 mask) @@ -454,7 +577,8 @@ static bool valid_l4_mask(u32 mask) } static bool valid_pedit_action(struct net_device *dev, - const struct flow_action_entry *act) + const struct flow_action_entry *act, + u8 *natmode_flags) { u32 mask, offset; u8 htype; @@ -479,7 +603,10 @@ static bool valid_pedit_action(struct net_device *dev, case FLOW_ACT_MANGLE_HDR_TYPE_IP4: switch (offset) { case PEDIT_IP4_SRC: + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; + break; case PEDIT_IP4_DST: + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -493,10 +620,13 @@ static bool valid_pedit_action(struct net_device *dev, case PEDIT_IP6_SRC_63_32: case PEDIT_IP6_SRC_95_64: case PEDIT_IP6_SRC_127_96: + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; + break; case PEDIT_IP6_DST_31_0: case PEDIT_IP6_DST_63_32: case PEDIT_IP6_DST_95_64: case PEDIT_IP6_DST_127_96: + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -512,6 +642,10 @@ static bool valid_pedit_action(struct net_device *dev, __func__); return false; } + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + else + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -527,6 +661,10 @@ static bool valid_pedit_action(struct net_device *dev, __func__); return false; } + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + else + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -546,10 +684,12 @@ int cxgb4_validate_flow_actions(struct net_device *dev, struct netlink_ext_ack *extack, u8 matchall_filter) { + struct adapter *adap = netdev2adap(dev); struct flow_action_entry *act; bool act_redir = false; bool act_pedit = false; bool act_vlan = false; + u8 natmode_flags = 0; int i; if (!flow_action_basic_hw_stats_check(actions, extack)) @@ -563,7 +703,6 @@ int cxgb4_validate_flow_actions(struct net_device *dev, break; case FLOW_ACTION_MIRRED: case FLOW_ACTION_REDIRECT: { - struct adapter *adap = netdev2adap(dev); struct net_device *n_dev, *target_dev; bool found = false; unsigned int i; @@ -620,7 +759,8 @@ int cxgb4_validate_flow_actions(struct net_device *dev, } break; case FLOW_ACTION_MANGLE: { - bool pedit_valid = valid_pedit_action(dev, act); + bool pedit_valid = valid_pedit_action(dev, 
act, + &natmode_flags); if (!pedit_valid) return -EOPNOTSUPP; @@ -642,6 +782,15 @@ int cxgb4_validate_flow_actions(struct net_device *dev, return -EINVAL; } + if (act_pedit) { + int ret; + + ret = cxgb4_action_natmode_validate(adap, natmode_flags, + extack); + if (ret) + return ret; + } + return 0; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h index 6296e1d5a12b..3a2fa00c8cde 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h @@ -108,6 +108,21 @@ struct ch_tc_pedit_fields { #define PEDIT_TCP_SPORT_DPORT 0x0 #define PEDIT_UDP_SPORT_DPORT 0x0 +enum cxgb4_action_natmode_flags { + CXGB4_ACTION_NATMODE_NONE = 0, + CXGB4_ACTION_NATMODE_DIP = (1 << 0), + CXGB4_ACTION_NATMODE_SIP = (1 << 1), + CXGB4_ACTION_NATMODE_DPORT = (1 << 2), + CXGB4_ACTION_NATMODE_SPORT = (1 << 3), +}; + +/* TC PEDIT action to NATMODE translation entry */ +struct cxgb4_natmode_config { + enum chip_type chip; + u8 flags; + u8 natmode; +}; + void cxgb4_process_flow_actions(struct net_device *in, struct flow_action *actions, struct ch_filter_specification *fs); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c index 0e7d25169407..072299b14b8d 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c @@ -35,7 +35,7 @@ * Atul Gupta (atul.gupta@chelsio.com) */ -#define pr_fmt(fmt) "chcr:" fmt +#define pr_fmt(fmt) "ch_ipsec: " fmt #include <linux/kernel.h> #include <linux/module.h> @@ -72,20 +72,21 @@ static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); -static int chcr_xfrm_add_state(struct xfrm_state *x); -static void chcr_xfrm_del_state(struct xfrm_state *x); -static void chcr_xfrm_free_state(struct xfrm_state *x); -static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x); -static void chcr_advance_esn_state(struct xfrm_state *x); +static bool ch_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static int ch_ipsec_uld_state_change(void *handle, enum cxgb4_state new_state); +static int ch_ipsec_xmit(struct sk_buff *skb, struct net_device *dev); static void *ch_ipsec_uld_add(const struct cxgb4_lld_info *infop); - -static const struct xfrmdev_ops chcr_xfrmdev_ops = { - .xdo_dev_state_add = chcr_xfrm_add_state, - .xdo_dev_state_delete = chcr_xfrm_del_state, - .xdo_dev_state_free = chcr_xfrm_free_state, - .xdo_dev_offload_ok = chcr_ipsec_offload_ok, - .xdo_dev_state_advance_esn = chcr_advance_esn_state, +static void ch_ipsec_advance_esn_state(struct xfrm_state *x); +static void ch_ipsec_xfrm_free_state(struct xfrm_state *x); +static void ch_ipsec_xfrm_del_state(struct xfrm_state *x); +static int ch_ipsec_xfrm_add_state(struct xfrm_state *x); + +static const struct xfrmdev_ops ch_ipsec_xfrmdev_ops = { + .xdo_dev_state_add = ch_ipsec_xfrm_add_state, + .xdo_dev_state_delete = ch_ipsec_xfrm_del_state, + .xdo_dev_state_free = ch_ipsec_xfrm_free_state, + .xdo_dev_offload_ok = ch_ipsec_offload_ok, + .xdo_dev_state_advance_esn = ch_ipsec_advance_esn_state, }; static struct cxgb4_uld_info ch_ipsec_uld_info = { @@ -95,8 +96,8 @@ static struct cxgb4_uld_info ch_ipsec_uld_info = { .rxq_size = 1024, .add = ch_ipsec_uld_add, .state_change = ch_ipsec_uld_state_change, - .tx_handler = chcr_ipsec_xmit, - .xfrmdev_ops = &chcr_xfrmdev_ops, + .tx_handler = ch_ipsec_xmit, + 
.xfrmdev_ops = &ch_ipsec_xfrmdev_ops, }; static void *ch_ipsec_uld_add(const struct cxgb4_lld_info *infop) @@ -119,7 +120,7 @@ static int ch_ipsec_uld_state_change(void *handle, enum cxgb4_state new_state) { struct ipsec_uld_ctx *u_ctx = handle; - pr_info("new_state %u\n", new_state); + pr_debug("new_state %u\n", new_state); switch (new_state) { case CXGB4_STATE_UP: pr_info("%s: Up\n", pci_name(u_ctx->lldi.pdev)); @@ -140,8 +141,8 @@ static int ch_ipsec_uld_state_change(void *handle, enum cxgb4_state new_state) return 0; } -static inline int chcr_ipsec_setauthsize(struct xfrm_state *x, - struct ipsec_sa_entry *sa_entry) +static int ch_ipsec_setauthsize(struct xfrm_state *x, + struct ipsec_sa_entry *sa_entry) { int hmac_ctrl; int authsize = x->aead->alg_icv_len / 8; @@ -164,8 +165,8 @@ static inline int chcr_ipsec_setauthsize(struct xfrm_state *x, return hmac_ctrl; } -static inline int chcr_ipsec_setkey(struct xfrm_state *x, - struct ipsec_sa_entry *sa_entry) +static int ch_ipsec_setkey(struct xfrm_state *x, + struct ipsec_sa_entry *sa_entry) { int keylen = (x->aead->alg_key_len + 7) / 8; unsigned char *key = x->aead->alg_key; @@ -223,65 +224,65 @@ out: } /* - * chcr_xfrm_add_state + * ch_ipsec_xfrm_add_state * returns 0 on success, negative error if failed to send message to FPGA * positive error if FPGA returned a bad response */ -static int chcr_xfrm_add_state(struct xfrm_state *x) +static int ch_ipsec_xfrm_add_state(struct xfrm_state *x) { struct ipsec_sa_entry *sa_entry; int res = 0; if (x->props.aalgo != SADB_AALG_NONE) { - pr_debug("CHCR: Cannot offload authenticated xfrm states\n"); + pr_debug("Cannot offload authenticated xfrm states\n"); return -EINVAL; } if (x->props.calgo != SADB_X_CALG_NONE) { - pr_debug("CHCR: Cannot offload compressed xfrm states\n"); + pr_debug("Cannot offload compressed xfrm states\n"); return -EINVAL; } if (x->props.family != AF_INET && x->props.family != AF_INET6) { - pr_debug("CHCR: Only IPv4/6 xfrm state offloaded\n"); + pr_debug("Only IPv4/6 xfrm state offloaded\n"); return -EINVAL; } if (x->props.mode != XFRM_MODE_TRANSPORT && x->props.mode != XFRM_MODE_TUNNEL) { - pr_debug("CHCR: Only transport and tunnel xfrm offload\n"); + pr_debug("Only transport and tunnel xfrm offload\n"); return -EINVAL; } if (x->id.proto != IPPROTO_ESP) { - pr_debug("CHCR: Only ESP xfrm state offloaded\n"); + pr_debug("Only ESP xfrm state offloaded\n"); return -EINVAL; } if (x->encap) { - pr_debug("CHCR: Encapsulated xfrm state not offloaded\n"); + pr_debug("Encapsulated xfrm state not offloaded\n"); return -EINVAL; } if (!x->aead) { - pr_debug("CHCR: Cannot offload xfrm states without aead\n"); + pr_debug("Cannot offload xfrm states without aead\n"); return -EINVAL; } if (x->aead->alg_icv_len != 128 && x->aead->alg_icv_len != 96) { - pr_debug("CHCR: Cannot offload xfrm states with AEAD ICV length other than 96b & 128b\n"); + pr_debug("Cannot offload xfrm states with AEAD ICV length other than 96b & 128b\n"); return -EINVAL; } if ((x->aead->alg_key_len != 128 + 32) && (x->aead->alg_key_len != 256 + 32)) { - pr_debug("CHCR: Cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); + pr_debug("cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); return -EINVAL; } if (x->tfcpad) { - pr_debug("CHCR: Cannot offload xfrm states with tfc padding\n"); + pr_debug("Cannot offload xfrm states with tfc padding\n"); return -EINVAL; } if (!x->geniv) { - pr_debug("CHCR: Cannot offload xfrm states without geniv\n"); + pr_debug("Cannot offload xfrm 
states without geniv\n"); return -EINVAL; } if (strcmp(x->geniv, "seqiv")) { - pr_debug("CHCR: Cannot offload xfrm states with geniv other than seqiv\n"); + pr_debug("Cannot offload xfrm states with geniv other than seqiv\n"); return -EINVAL; } @@ -291,24 +292,24 @@ static int chcr_xfrm_add_state(struct xfrm_state *x) goto out; } - sa_entry->hmac_ctrl = chcr_ipsec_setauthsize(x, sa_entry); + sa_entry->hmac_ctrl = ch_ipsec_setauthsize(x, sa_entry); if (x->props.flags & XFRM_STATE_ESN) sa_entry->esn = 1; - chcr_ipsec_setkey(x, sa_entry); + ch_ipsec_setkey(x, sa_entry); x->xso.offload_handle = (unsigned long)sa_entry; try_module_get(THIS_MODULE); out: return res; } -static void chcr_xfrm_del_state(struct xfrm_state *x) +static void ch_ipsec_xfrm_del_state(struct xfrm_state *x) { /* do nothing */ if (!x->xso.offload_handle) return; } -static void chcr_xfrm_free_state(struct xfrm_state *x) +static void ch_ipsec_xfrm_free_state(struct xfrm_state *x) { struct ipsec_sa_entry *sa_entry; @@ -320,7 +321,7 @@ static void chcr_xfrm_free_state(struct xfrm_state *x) module_put(THIS_MODULE); } -static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +static bool ch_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { if (x->props.family == AF_INET) { /* Offload with IP options is not supported yet */ @@ -334,15 +335,15 @@ static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return true; } -static void chcr_advance_esn_state(struct xfrm_state *x) +static void ch_ipsec_advance_esn_state(struct xfrm_state *x) { /* do nothing */ if (!x->xso.offload_handle) return; } -static inline int is_eth_imm(const struct sk_buff *skb, - struct ipsec_sa_entry *sa_entry) +static int is_eth_imm(const struct sk_buff *skb, + struct ipsec_sa_entry *sa_entry) { unsigned int kctx_len; int hdrlen; @@ -360,9 +361,9 @@ static inline int is_eth_imm(const struct sk_buff *skb, return 0; } -static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb, - struct ipsec_sa_entry *sa_entry, - bool *immediate) +static unsigned int calc_tx_sec_flits(const struct sk_buff *skb, + struct ipsec_sa_entry *sa_entry, + bool *immediate) { unsigned int kctx_len; unsigned int flits; @@ -403,7 +404,7 @@ static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb, return flits; } -inline void *copy_esn_pktxt(struct sk_buff *skb, +static void *copy_esn_pktxt(struct sk_buff *skb, struct net_device *dev, void *pos, struct ipsec_sa_entry *sa_entry) @@ -457,7 +458,7 @@ inline void *copy_esn_pktxt(struct sk_buff *skb, return pos; } -inline void *copy_cpltx_pktxt(struct sk_buff *skb, +static void *copy_cpltx_pktxt(struct sk_buff *skb, struct net_device *dev, void *pos, struct ipsec_sa_entry *sa_entry) @@ -501,10 +502,10 @@ inline void *copy_cpltx_pktxt(struct sk_buff *skb, return pos; } -inline void *copy_key_cpltx_pktxt(struct sk_buff *skb, - struct net_device *dev, - void *pos, - struct ipsec_sa_entry *sa_entry) +static void *copy_key_cpltx_pktxt(struct sk_buff *skb, + struct net_device *dev, + void *pos, + struct ipsec_sa_entry *sa_entry) { struct _key_ctx *key_ctx; int left, eoq, key_len; @@ -549,11 +550,11 @@ inline void *copy_key_cpltx_pktxt(struct sk_buff *skb, return pos; } -inline void *chcr_crypto_wreq(struct sk_buff *skb, - struct net_device *dev, - void *pos, - int credits, - struct ipsec_sa_entry *sa_entry) +static void *ch_ipsec_crypto_wreq(struct sk_buff *skb, + struct net_device *dev, + void *pos, + int credits, + struct ipsec_sa_entry *sa_entry) { struct port_info *pi = 
netdev_priv(dev); struct adapter *adap = pi->adapter; @@ -674,13 +675,13 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb, * Returns the number of Tx descriptors needed for the supplied number * of flits. */ -static inline unsigned int flits_to_desc(unsigned int n) +static unsigned int flits_to_desc(unsigned int n) { WARN_ON(n > SGE_MAX_WR_LEN / 8); return DIV_ROUND_UP(n, 8); } -static inline unsigned int txq_avail(const struct sge_txq *q) +static unsigned int txq_avail(const struct sge_txq *q) { return q->size - 1 - q->in_use; } @@ -691,7 +692,7 @@ static void eth_txq_stop(struct sge_eth_txq *q) q->q.stops++; } -static inline void txq_advance(struct sge_txq *q, unsigned int n) +static void txq_advance(struct sge_txq *q, unsigned int n) { q->in_use += n; q->pidx += n; @@ -700,9 +701,9 @@ static inline void txq_advance(struct sge_txq *q, unsigned int n) } /* - * chcr_ipsec_xmit called from ULD Tx handler + * ch_ipsec_xmit called from ULD Tx handler */ -int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev) +int ch_ipsec_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_state *x = xfrm_input_state(skb); unsigned int last_desc, ndesc, flits = 0; @@ -763,8 +764,8 @@ out_free: dev_kfree_skb_any(skb); before = (u64 *)pos; end = (u64 *)pos + flits; /* Setup IPSec CPL */ - pos = (void *)chcr_crypto_wreq(skb, dev, (void *)pos, - credits, sa_entry); + pos = (void *)ch_ipsec_crypto_wreq(skb, dev, (void *)pos, + credits, sa_entry); if (before > (u64 *)pos) { left = (u8 *)end - (u8 *)q->q.stat; end = (void *)q->q.desc + left; @@ -791,14 +792,14 @@ out_free: dev_kfree_skb_any(skb); return NETDEV_TX_OK; } -static int __init chcr_ipsec_init(void) +static int __init ch_ipsec_init(void) { cxgb4_register_uld(CXGB4_ULD_IPSEC, &ch_ipsec_uld_info); return 0; } -static void __exit chcr_ipsec_exit(void) +static void __exit ch_ipsec_exit(void) { struct ipsec_uld_ctx *u_ctx, *tmp; struct adapter *adap; @@ -814,8 +815,8 @@ static void __exit chcr_ipsec_exit(void) cxgb4_unregister_uld(CXGB4_ULD_IPSEC); } -module_init(chcr_ipsec_init); -module_exit(chcr_ipsec_exit); +module_init(ch_ipsec_init); +module_exit(ch_ipsec_exit); MODULE_DESCRIPTION("Crypto IPSEC for Chelsio Terminator cards."); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index c043afb38b6e..8f7eca1e7716 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1912,6 +1912,27 @@ out: return ret; } +static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + struct phy_device *phy_dev = ndev->phydev; + + if (phy_dev) { + phy_reset_after_clk_enable(phy_dev); + } else if (fep->phy_node) { + /* + * If the PHY still is not bound to the MAC, but there is + * OF PHY node and a matching PHY device instance already, + * use the OF PHY node to obtain the PHY device instance, + * and then use that PHY device instance when triggering + * the PHY reset. 
+ */ + phy_dev = of_phy_find_device(fep->phy_node); + phy_reset_after_clk_enable(phy_dev); + put_device(&phy_dev->mdio.dev); + } +} + static int fec_enet_clk_enable(struct net_device *ndev, bool enable) { struct fec_enet_private *fep = netdev_priv(ndev); @@ -1938,7 +1959,7 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable) if (ret) goto failed_clk_ref; - phy_reset_after_clk_enable(ndev->phydev); + fec_enet_phy_reset_after_clk_enable(ndev); } else { clk_disable_unprepare(fep->clk_enet_out); if (fep->clk_ptp) { @@ -2983,16 +3004,16 @@ fec_enet_open(struct net_device *ndev) /* Init MAC prior to mii bus probe */ fec_restart(ndev); - /* Probe and connect to PHY when open the interface */ - ret = fec_enet_mii_probe(ndev); - if (ret) - goto err_enet_mii_probe; - /* Call phy_reset_after_clk_enable() again if it failed during * phy_reset_after_clk_enable() before because the PHY wasn't probed. */ if (reset_again) - phy_reset_after_clk_enable(ndev->phydev); + fec_enet_phy_reset_after_clk_enable(ndev); + + /* Probe and connect to PHY when open the interface */ + ret = fec_enet_mii_probe(ndev); + if (ret) + goto err_enet_mii_probe; if (fep->quirks & FEC_QUIRK_ERR006687) imx6q_cpuidle_fec_irqs_used(); diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index c5c732601e35..7ef3369953b6 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1349,6 +1349,7 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) int offset = ibmveth_rxq_frame_offset(adapter); int csum_good = ibmveth_rxq_csum_good(adapter); int lrg_pkt = ibmveth_rxq_large_packet(adapter); + __sum16 iph_check = 0; skb = ibmveth_rxq_get_buffer(adapter); @@ -1385,16 +1386,26 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); - if (csum_good) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - ibmveth_rx_csum_helper(skb, adapter); + /* PHYP without PLSO support places a -1 in the ip + * checksum for large send frames. + */ + if (skb->protocol == cpu_to_be16(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)skb->data; + + iph_check = iph->check; } - if (length > netdev->mtu + ETH_HLEN) { + if ((length > netdev->mtu + ETH_HLEN) || + lrg_pkt || iph_check == 0xffff) { ibmveth_rx_mss_helper(skb, mss, lrg_pkt); adapter->rx_large_packets++; } + if (csum_good) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + ibmveth_rx_csum_helper(skb, adapter); + } + napi_gro_receive(napi, skb); /* send it up */ netdev->stats.rx_packets++; diff --git a/drivers/net/ethernet/intel/e1000/e1000_hw.c b/drivers/net/ethernet/intel/e1000/e1000_hw.c index fb5af23880c3..4c0c9433bd60 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_hw.c +++ b/drivers/net/ethernet/intel/e1000/e1000_hw.c @@ -4401,17 +4401,9 @@ void e1000_write_vfta(struct e1000_hw *hw, u32 offset, u32 value) static void e1000_clear_vfta(struct e1000_hw *hw) { u32 offset; - u32 vfta_value = 0; - u32 vfta_offset = 0; - u32 vfta_bit_in_reg = 0; for (offset = 0; offset < E1000_VLAN_FILTER_TBL_SIZE; offset++) { - /* If the offset we want to clear is the same offset of the - * manageability VLAN ID, then clear all bits except that of the - * manageability unit - */ - vfta_value = (offset == vfta_offset) ? 
vfta_bit_in_reg : 0; - E1000_WRITE_REG_ARRAY(hw, VFTA, offset, vfta_value); + E1000_WRITE_REG_ARRAY(hw, VFTA, offset, 0); E1000_WRITE_FLUSH(); } } diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c index c897a2863e4f..593912b17609 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -541,6 +541,12 @@ static void i40e_set_hw_flags(struct i40e_hw *hw) (aq->api_maj_ver == 1 && aq->api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_X722)) hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE; + + if (aq->api_maj_ver > 1 || + (aq->api_maj_ver == 1 && + aq->api_min_ver >= I40E_MINOR_VER_FW_REQUEST_FEC_X722)) + hw->flags |= I40E_HW_FLAG_X722_FEC_REQUEST_CAPABLE; + fallthrough; default: break; diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index c0c8efe42fce..1e960c3c7ef0 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -24,6 +24,8 @@ #define I40E_MINOR_VER_GET_LINK_INFO_X722 0x0009 /* API version 1.6 for X722 devices adds ability to stop FW LLDP agent */ #define I40E_MINOR_VER_FW_LLDP_STOPPABLE_X722 0x0006 +/* API version 1.10 for X722 devices adds ability to request FEC encoding */ +#define I40E_MINOR_VER_FW_REQUEST_FEC_X722 0x000A struct i40e_aq_desc { __le16 flags; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index dc1577156bb6..26ba1f3eb2d8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -891,6 +891,7 @@ static void i40e_get_settings_link_up(struct i40e_hw *hw, if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) ethtool_link_ksettings_add_link_mode(ks, advertising, 10000baseT_Full); + i40e_get_settings_link_up_fec(hw_link_info->req_fec_info, ks); break; case I40E_PHY_TYPE_SGMII: ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); @@ -1481,12 +1482,16 @@ static int i40e_set_fec_param(struct net_device *netdev, struct i40e_pf *pf = np->vsi->back; struct i40e_hw *hw = &pf->hw; u8 fec_cfg = 0; - int err = 0; if (hw->device_id != I40E_DEV_ID_25G_SFP28 && - hw->device_id != I40E_DEV_ID_25G_B) { - err = -EPERM; - goto done; + hw->device_id != I40E_DEV_ID_25G_B && + hw->device_id != I40E_DEV_ID_KX_X722) + return -EPERM; + + if (hw->mac.type == I40E_MAC_X722 && + !(hw->flags & I40E_HW_FLAG_X722_FEC_REQUEST_CAPABLE)) { + netdev_err(netdev, "Setting FEC encoding not supported by firmware. 
Please update the NVM image.\n"); + return -EOPNOTSUPP; } switch (fecparam->fec) { @@ -1508,14 +1513,10 @@ static int i40e_set_fec_param(struct net_device *netdev, default: dev_warn(&pf->pdev->dev, "Unsupported FEC mode: %d", fecparam->fec); - err = -EINVAL; - goto done; + return -EINVAL; } - err = i40e_set_fec_cfg(netdev, fec_cfg); - -done: - return err; + return i40e_set_fec_cfg(netdev, fec_cfg); } static int i40e_nway_reset(struct net_device *netdev) @@ -4951,8 +4952,7 @@ flags_complete: } } - if (((changed_flags & I40E_FLAG_RS_FEC) || - (changed_flags & I40E_FLAG_BASE_R_FEC)) && + if (changed_flags & I40E_FLAG_RS_FEC && pf->hw.device_id != I40E_DEV_ID_25G_SFP28 && pf->hw.device_id != I40E_DEV_ID_25G_B) { dev_warn(&pf->pdev->dev, @@ -4960,6 +4960,15 @@ flags_complete: return -EOPNOTSUPP; } + if (changed_flags & I40E_FLAG_BASE_R_FEC && + pf->hw.device_id != I40E_DEV_ID_25G_SFP28 && + pf->hw.device_id != I40E_DEV_ID_25G_B && + pf->hw.device_id != I40E_DEV_ID_KX_X722) { + dev_warn(&pf->pdev->dev, + "Device does not support changing FEC configuration\n"); + return -EOPNOTSUPP; + } + /* Process any additional changes needed as a result of flag changes. * The changed_flags value reflects the list of bits that were * changed in the code above. diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 929c64789119..4f8a2154b93f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6625,6 +6625,25 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) netdev_info(vsi->netdev, "NIC Link is Up, %sbps Full Duplex, Requested FEC: %s, Negotiated FEC: %s, Autoneg: %s, Flow Control: %s\n", speed, req_fec, fec, an, fc); + } else if (pf->hw.device_id == I40E_DEV_ID_KX_X722) { + req_fec = "None"; + fec = "None"; + an = "False"; + + if (pf->hw.phy.link_info.an_info & I40E_AQ_AN_COMPLETED) + an = "True"; + + if (pf->hw.phy.link_info.fec_info & + I40E_AQ_CONFIG_FEC_KR_ENA) + fec = "CL74 FC-FEC/BASE-R"; + + if (pf->hw.phy.link_info.req_fec_info & + I40E_AQ_REQUEST_FEC_KR) + req_fec = "CL74 FC-FEC/BASE-R"; + + netdev_info(vsi->netdev, + "NIC Link is Up, %sbps Full Duplex, Requested FEC: %s, Negotiated FEC: %s, Autoneg: %s, Flow Control: %s\n", + speed, req_fec, fec, an, fc); } else { netdev_info(vsi->netdev, "NIC Link is Up, %sbps Full Duplex, Flow Control: %s\n", diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 97d29df65f9e..c0bdc666f557 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -595,6 +595,7 @@ struct i40e_hw { #define I40E_HW_FLAG_FW_LLDP_PERSISTENT BIT_ULL(5) #define I40E_HW_FLAG_AQ_PHY_ACCESS_EXTENDED BIT_ULL(6) #define I40E_HW_FLAG_DROP_MODE BIT_ULL(7) +#define I40E_HW_FLAG_X722_FEC_REQUEST_CAPABLE BIT_ULL(8) u64 flags; /* Used in set switch config AQ command */ diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index 03e034918d14..af441d699a57 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -1113,7 +1113,7 @@ out: return rc; probe_err_register: - kfree(lp->td_ring); + kfree(KSEG0ADDR(lp->td_ring)); probe_err_td_ring: iounmap(lp->tx_dma_regs); probe_err_dma_tx: @@ -1133,6 +1133,7 @@ static int korina_remove(struct platform_device *pdev) iounmap(lp->eth_regs); iounmap(lp->rx_dma_regs); iounmap(lp->tx_dma_regs); + kfree(KSEG0ADDR(lp->td_ring)); unregister_netdev(bif->dev); free_netdev(bif->dev); diff --git 
a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 99d7737e8ad6..502d1b97855c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -943,6 +943,9 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) bool clean_complete = true; int done; + if (!budget) + return 0; + if (priv->tx_ring_num[TX_XDP]) { xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring]; if (xdp_tx_cq->xdp_busy) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 45869b29368f..3ddb7268e415 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -350,7 +350,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, .dma = tx_info->map0_dma, }; - if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { + if (!napi_mode || !mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { dma_unmap_page(priv->ddev, tx_info->map0_dma, PAGE_SIZE, priv->dma_dir); put_page(tx_info->page); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 6a97452dc60e..dc744702aee4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -236,6 +236,7 @@ struct mlx5e_accel_fs_tcp; struct mlx5e_flow_steering { struct mlx5_flow_namespace *ns; + struct mlx5_flow_namespace *egress_ns; #ifdef CONFIG_MLX5_EN_RXNFC struct mlx5e_ethtool_steering ethtool; #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 2ea1cdc1ca54..899b98aca0d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -107,6 +107,9 @@ struct mlx5e_accel_tx_state { #ifdef CONFIG_MLX5_EN_TLS struct mlx5e_accel_tx_tls_state tls; #endif +#ifdef CONFIG_MLX5_EN_IPSEC + struct mlx5e_accel_tx_ipsec_state ipsec; +#endif }; static inline bool mlx5e_accel_tx_begin(struct net_device *dev, @@ -125,22 +128,46 @@ static inline bool mlx5e_accel_tx_begin(struct net_device *dev, } #endif +#ifdef CONFIG_MLX5_EN_IPSEC + if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state) && xfrm_offload(skb)) { + if (unlikely(!mlx5e_ipsec_handle_tx_skb(dev, skb, &state->ipsec))) + return false; + } +#endif + return true; } +static inline bool mlx5e_accel_tx_is_ipsec_flow(struct mlx5e_accel_tx_state *state) +{ +#ifdef CONFIG_MLX5_EN_IPSEC + return mlx5e_ipsec_is_tx_flow(&state->ipsec); +#endif + + return false; +} + +static inline unsigned int mlx5e_accel_tx_ids_len(struct mlx5e_txqsq *sq, + struct mlx5e_accel_tx_state *state) +{ +#ifdef CONFIG_MLX5_EN_IPSEC + if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state)) + return mlx5e_ipsec_tx_ids_len(&state->ipsec); +#endif + + return 0; +} + /* Part of the eseg touched by TX offloads */ #define MLX5E_ACCEL_ESEG_LEN offsetof(struct mlx5_wqe_eth_seg, mss) static inline bool mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, - struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { #ifdef CONFIG_MLX5_EN_IPSEC - if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state)) { - if (unlikely(!mlx5e_ipsec_handle_tx_skb(priv, eseg, skb))) - return false; - } + if (xfrm_offload(skb)) + mlx5e_ipsec_tx_build_eseg(priv, skb, eseg); #endif #if IS_ENABLED(CONFIG_GENEVE) @@ -153,11 +180,18 @@ static inline bool mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, static inline void mlx5e_accel_tx_finish(struct mlx5e_txqsq 
*sq, struct mlx5e_tx_wqe *wqe, - struct mlx5e_accel_tx_state *state) + struct mlx5e_accel_tx_state *state, + struct mlx5_wqe_inline_seg *inlseg) { #ifdef CONFIG_MLX5_EN_TLS mlx5e_tls_handle_tx_wqe(sq, &wqe->ctrl, &state->tls); #endif + +#ifdef CONFIG_MLX5_EN_IPSEC + if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state) && + state->ipsec.xo && state->ipsec.tailen) + mlx5e_ipsec_handle_tx_wqe(wqe, &state->ipsec, inlseg); +#endif } static inline int mlx5e_accel_init_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index d39989cddd90..3d45341e2216 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -560,6 +560,9 @@ void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) return; } + if (mlx5_is_ipsec_device(mdev)) + netdev->gso_partial_features |= NETIF_F_GSO_ESP; + mlx5_core_dbg(mdev, "mlx5e: ESP GSO capability turned on\n"); netdev->features |= NETIF_F_GSO_ESP; netdev->hw_features |= NETIF_F_GSO_ESP; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index 0fc8b4d4f4a3..6164c7f59efb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -76,6 +76,7 @@ struct mlx5e_ipsec_stats { }; struct mlx5e_accel_fs_esp; +struct mlx5e_ipsec_tx; struct mlx5e_ipsec { struct mlx5e_priv *en_priv; @@ -87,6 +88,7 @@ struct mlx5e_ipsec { struct mlx5e_ipsec_stats stats; struct workqueue_struct *wq; struct mlx5e_accel_fs_esp *rx_fs; + struct mlx5e_ipsec_tx *tx_fs; }; struct mlx5e_ipsec_esn_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index b974f3cd1005..0e45590662a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -34,6 +34,12 @@ struct mlx5e_accel_fs_esp { struct mlx5e_accel_fs_esp_prot fs_prot[ACCEL_FS_ESP_NUM_TYPES]; }; +struct mlx5e_ipsec_tx { + struct mlx5_flow_table *ft; + struct mutex mutex; /* Protect IPsec TX steering */ + u32 refcnt; +}; + /* IPsec RX flow steering */ static enum mlx5e_traffic_types fs_esp2tt(enum accel_fs_esp_type i) { @@ -323,6 +329,77 @@ out: mutex_unlock(&fs_prot->prot_mutex); } +/* IPsec TX flow steering */ +static int tx_create(struct mlx5e_priv *priv) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5e_ipsec *ipsec = priv->ipsec; + struct mlx5_flow_table *ft; + int err; + + priv->fs.egress_ns = + mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_EGRESS_KERNEL); + if (!priv->fs.egress_ns) + return -EOPNOTSUPP; + + ft_attr.max_fte = NUM_IPSEC_FTE; + ft_attr.autogroup.max_num_groups = 1; + ft = mlx5_create_auto_grouped_flow_table(priv->fs.egress_ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(priv->netdev, "fail to create ipsec tx ft err=%d\n", err); + return err; + } + ipsec->tx_fs->ft = ft; + return 0; +} + +static void tx_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec *ipsec = priv->ipsec; + + if (IS_ERR_OR_NULL(ipsec->tx_fs->ft)) + return; + + mlx5_destroy_flow_table(ipsec->tx_fs->ft); + ipsec->tx_fs->ft = NULL; +} + +static int tx_ft_get(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec_tx *tx_fs = priv->ipsec->tx_fs; + int err = 0; + + mutex_lock(&tx_fs->mutex); + if (tx_fs->refcnt++) + goto out; + + err = 
tx_create(priv); + if (err) { + tx_fs->refcnt--; + goto out; + } + +out: + mutex_unlock(&tx_fs->mutex); + return err; +} + +static void tx_ft_put(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec_tx *tx_fs = priv->ipsec->tx_fs; + + mutex_lock(&tx_fs->mutex); + if (--tx_fs->refcnt) + goto out; + + tx_destroy(priv); + +out: + mutex_unlock(&tx_fs->mutex); +} + static void setup_fte_common(struct mlx5_accel_esp_xfrm_attrs *attrs, u32 ipsec_obj_id, struct mlx5_flow_spec *spec, @@ -457,6 +534,54 @@ out: return err; } +static int tx_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + + err = tx_ft_get(priv); + if (err) + return err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + setup_fte_common(attrs, ipsec_obj_id, spec, &flow_act); + + /* Add IPsec indicator in metadata_reg_a */ + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_a, + MLX5_ETH_WQE_FT_META_IPSEC); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_a, + MLX5_ETH_WQE_FT_META_IPSEC); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW | + MLX5_FLOW_CONTEXT_ACTION_IPSEC_ENCRYPT; + rule = mlx5_add_flow_rules(priv->ipsec->tx_fs->ft, spec, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "fail to add ipsec rule attrs->action=0x%x, err=%d\n", + attrs->action, err); + goto out; + } + + ipsec_rule->rule = rule; + +out: + kvfree(spec); + if (err) + tx_ft_put(priv); + return err; +} + static void rx_del_rule(struct mlx5e_priv *priv, struct mlx5_accel_esp_xfrm_attrs *attrs, struct mlx5e_ipsec_rule *ipsec_rule) @@ -470,15 +595,27 @@ static void rx_del_rule(struct mlx5e_priv *priv, rx_ft_put(priv, attrs->is_ipv6 ? 
ACCEL_FS_ESP6 : ACCEL_FS_ESP4); } +static void tx_del_rule(struct mlx5e_priv *priv, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + mlx5_del_flow_rules(ipsec_rule->rule); + ipsec_rule->rule = NULL; + + tx_ft_put(priv); +} + int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_priv *priv, struct mlx5_accel_esp_xfrm_attrs *attrs, u32 ipsec_obj_id, struct mlx5e_ipsec_rule *ipsec_rule) { - if (!priv->ipsec->rx_fs || attrs->action != MLX5_ACCEL_ESP_ACTION_DECRYPT) + if (!priv->ipsec->rx_fs) return -EOPNOTSUPP; - return rx_add_rule(priv, attrs, ipsec_obj_id, ipsec_rule); + if (attrs->action == MLX5_ACCEL_ESP_ACTION_DECRYPT) + return rx_add_rule(priv, attrs, ipsec_obj_id, ipsec_rule); + else + return tx_add_rule(priv, attrs, ipsec_obj_id, ipsec_rule); } void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_priv *priv, @@ -488,7 +625,18 @@ void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_priv *priv, if (!priv->ipsec->rx_fs) return; - rx_del_rule(priv, attrs, ipsec_rule); + if (attrs->action == MLX5_ACCEL_ESP_ACTION_DECRYPT) + rx_del_rule(priv, attrs, ipsec_rule); + else + tx_del_rule(priv, ipsec_rule); +} + +static void fs_cleanup_tx(struct mlx5e_priv *priv) +{ + mutex_destroy(&priv->ipsec->tx_fs->mutex); + WARN_ON(priv->ipsec->tx_fs->refcnt); + kfree(priv->ipsec->tx_fs); + priv->ipsec->tx_fs = NULL; } static void fs_cleanup_rx(struct mlx5e_priv *priv) @@ -507,6 +655,17 @@ static void fs_cleanup_rx(struct mlx5e_priv *priv) priv->ipsec->rx_fs = NULL; } +static int fs_init_tx(struct mlx5e_priv *priv) +{ + priv->ipsec->tx_fs = + kzalloc(sizeof(struct mlx5e_ipsec_tx), GFP_KERNEL); + if (!priv->ipsec->tx_fs) + return -ENOMEM; + + mutex_init(&priv->ipsec->tx_fs->mutex); + return 0; +} + static int fs_init_rx(struct mlx5e_priv *priv) { struct mlx5e_accel_fs_esp_prot *fs_prot; @@ -532,13 +691,24 @@ void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_priv *priv) if (!priv->ipsec->rx_fs) return; + fs_cleanup_tx(priv); fs_cleanup_rx(priv); } int mlx5e_accel_ipsec_fs_init(struct mlx5e_priv *priv) { + int err; + if (!mlx5_is_ipsec_device(priv->mdev) || !priv->ipsec) return -EOPNOTSUPP; - return fs_init_rx(priv); + err = fs_init_tx(priv); + if (err) + return err; + + err = fs_init_rx(priv); + if (err) + fs_cleanup_tx(priv); + + return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index 93a8d68815ad..11e31a3db2be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -34,7 +34,7 @@ #include <crypto/aead.h> #include <net/xfrm.h> #include <net/esp.h> - +#include "accel/ipsec_offload.h" #include "en_accel/ipsec_rxtx.h" #include "en_accel/ipsec.h" #include "accel/accel.h" @@ -233,18 +233,94 @@ static void mlx5e_ipsec_set_metadata(struct sk_buff *skb, ntohs(mdata->content.tx.seq)); } -bool mlx5e_ipsec_handle_tx_skb(struct mlx5e_priv *priv, - struct mlx5_wqe_eth_seg *eseg, - struct sk_buff *skb) +void mlx5e_ipsec_handle_tx_wqe(struct mlx5e_tx_wqe *wqe, + struct mlx5e_accel_tx_ipsec_state *ipsec_st, + struct mlx5_wqe_inline_seg *inlseg) +{ + inlseg->byte_count = cpu_to_be32(ipsec_st->tailen | MLX5_INLINE_SEG); + esp_output_fill_trailer((u8 *)inlseg->data, 0, ipsec_st->plen, ipsec_st->xo->proto); +} + +static int mlx5e_ipsec_set_state(struct mlx5e_priv *priv, + struct sk_buff *skb, + struct xfrm_state *x, + struct xfrm_offload *xo, + struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + unsigned int blksize, clen, alen, plen; + struct crypto_aead *aead; + 
unsigned int tailen; + + ipsec_st->x = x; + ipsec_st->xo = xo; + if (mlx5_is_ipsec_device(priv->mdev)) { + aead = x->data; + alen = crypto_aead_authsize(aead); + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + clen = ALIGN(skb->len + 2, blksize); + plen = max_t(u32, clen - skb->len, 4); + tailen = plen + alen; + ipsec_st->plen = plen; + ipsec_st->tailen = tailen; + } + + return 0; +} + +void mlx5e_ipsec_tx_build_eseg(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) { struct xfrm_offload *xo = xfrm_offload(skb); - struct mlx5e_ipsec_metadata *mdata; - struct mlx5e_ipsec_sa_entry *sa_entry; + struct xfrm_encap_tmpl *encap; struct xfrm_state *x; struct sec_path *sp; + u8 l3_proto; + + sp = skb_sec_path(skb); + if (unlikely(sp->len != 1)) + return; + + x = xfrm_input_state(skb); + if (unlikely(!x)) + return; + + if (unlikely(!x->xso.offload_handle || + (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)))) + return; + + mlx5e_ipsec_set_swp(skb, eseg, x->props.mode, xo); - if (!xo) - return true; + l3_proto = (x->props.family == AF_INET) ? + ((struct iphdr *)skb_network_header(skb))->protocol : + ((struct ipv6hdr *)skb_network_header(skb))->nexthdr; + + if (mlx5_is_ipsec_device(priv->mdev)) { + eseg->flow_table_metadata |= cpu_to_be32(MLX5_ETH_WQE_FT_META_IPSEC); + eseg->trailer |= cpu_to_be32(MLX5_ETH_WQE_INSERT_TRAILER); + encap = x->encap; + if (!encap) { + eseg->trailer |= (l3_proto == IPPROTO_ESP) ? + cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC) : + cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC); + } else if (encap->encap_type == UDP_ENCAP_ESPINUDP) { + eseg->trailer |= (l3_proto == IPPROTO_ESP) ? + cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_INNER_IP_ASSOC) : + cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_INNER_L4_ASSOC); + } + } +} + +bool mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct xfrm_offload *xo = xfrm_offload(skb); + struct mlx5e_ipsec_sa_entry *sa_entry; + struct mlx5e_ipsec_metadata *mdata; + struct xfrm_state *x; + struct sec_path *sp; sp = skb_sec_path(skb); if (unlikely(sp->len != 1)) { @@ -270,15 +346,21 @@ bool mlx5e_ipsec_handle_tx_skb(struct mlx5e_priv *priv, atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_trailer); goto drop; } - mdata = mlx5e_ipsec_add_metadata(skb); - if (IS_ERR(mdata)) { - atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_metadata); - goto drop; + + if (MLX5_CAP_GEN(priv->mdev, fpga)) { + mdata = mlx5e_ipsec_add_metadata(skb); + if (IS_ERR(mdata)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_metadata); + goto drop; + } } - mlx5e_ipsec_set_swp(skb, eseg, x->props.mode, xo); + sa_entry = (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle; sa_entry->set_iv_op(skb, x, xo); - mlx5e_ipsec_set_metadata(skb, mdata, xo); + if (MLX5_CAP_GEN(priv->mdev, fpga)) + mlx5e_ipsec_set_metadata(skb, mdata, xo); + + mlx5e_ipsec_set_state(priv, skb, x, xo, ipsec_st); return true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h index f96e786db158..056dacb612b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h @@ -43,6 +43,13 @@ #define MLX5_IPSEC_METADATA_SYNDROM_MASK (0x7F) #define MLX5_IPSEC_METADATA_HANDLE(metadata) (((metadata) >> 8) & 0xFF) +struct mlx5e_accel_tx_ipsec_state { + struct 
xfrm_offload *xo; + struct xfrm_state *x; + u32 tailen; + u32 plen; +}; + #ifdef CONFIG_MLX5_EN_IPSEC struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev, @@ -55,16 +62,32 @@ void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x, struct xfrm_offload *xo); void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x, struct xfrm_offload *xo); -bool mlx5e_ipsec_handle_tx_skb(struct mlx5e_priv *priv, - struct mlx5_wqe_eth_seg *eseg, - struct sk_buff *skb); +bool mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5e_accel_tx_ipsec_state *ipsec_st); +void mlx5e_ipsec_handle_tx_wqe(struct mlx5e_tx_wqe *wqe, + struct mlx5e_accel_tx_ipsec_state *ipsec_st, + struct mlx5_wqe_inline_seg *inlseg); void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev, struct sk_buff *skb, struct mlx5_cqe64 *cqe); +static inline unsigned int mlx5e_ipsec_tx_ids_len(struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + return ipsec_st->tailen; +} + static inline bool mlx5_ipsec_is_rx_flow(struct mlx5_cqe64 *cqe) { return !!(MLX5_IPSEC_METADATA_MARKER_MASK & be32_to_cpu(cqe->ft_metadata)); } + +static inline bool mlx5e_ipsec_is_tx_flow(struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + return ipsec_st->x; +} + +void mlx5e_ipsec_tx_build_eseg(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg); #else static inline void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a0c356987e1a..e3a968e9e2a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4569,7 +4569,6 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size, err_free: kfree(flow); kvfree(parse_attr); - kfree(attr); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 13bd4f254ed7..82b4419af9d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -144,9 +144,29 @@ static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); } +/* RM 2311217: no L4 inner checksum for IPsec tunnel type packet */ +static void +ipsec_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) +{ + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; + if (skb->encapsulation) { + eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM; + sq->stats->csum_partial_inner++; + } else { + eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; + sq->stats->csum_partial++; + } +} + static inline void mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { + if (unlikely(eseg->flow_table_metadata & cpu_to_be32(MLX5_ETH_WQE_FT_META_IPSEC))) { + ipsec_txwqe_build_eseg_csum(sq, skb, eseg); + return; + } + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; if (skb->encapsulation) { @@ -237,12 +257,14 @@ struct mlx5e_tx_attr { u16 headlen; u16 ihs; __be16 mss; + u16 insz; u8 opcode; }; struct mlx5e_tx_wqe_attr { u16 ds_cnt; u16 ds_cnt_inl; + u16 ds_cnt_ids; u8 num_wqebbs; }; @@ -299,6 +321,7 @@ static void mlx5e_sq_xmit_prepare(struct mlx5e_txqsq *sq, struct sk_buff *skb, stats->packets++; } + attr->insz = mlx5e_accel_tx_ids_len(sq, accel); stats->bytes += attr->num_bytes; } @@ -307,9 
+330,13 @@ static void mlx5e_sq_calc_wqe_attr(struct sk_buff *skb, const struct mlx5e_tx_at { u16 ds_cnt = MLX5E_TX_WQE_EMPTY_DS_COUNT; u16 ds_cnt_inl = 0; + u16 ds_cnt_ids = 0; - ds_cnt += !!attr->headlen + skb_shinfo(skb)->nr_frags; + if (attr->insz) + ds_cnt_ids = DIV_ROUND_UP(sizeof(struct mlx5_wqe_inline_seg) + attr->insz, + MLX5_SEND_WQE_DS); + ds_cnt += !!attr->headlen + skb_shinfo(skb)->nr_frags + ds_cnt_ids; if (attr->ihs) { u16 inl = attr->ihs - INL_HDR_START_SZ; @@ -323,6 +350,7 @@ static void mlx5e_sq_calc_wqe_attr(struct sk_buff *skb, const struct mlx5e_tx_at *wqe_attr = (struct mlx5e_tx_wqe_attr) { .ds_cnt = ds_cnt, .ds_cnt_inl = ds_cnt_inl, + .ds_cnt_ids = ds_cnt_ids, .num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS), }; } @@ -398,11 +426,11 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, if (attr->ihs) { if (skb_vlan_tag_present(skb)) { - eseg->inline_hdr.sz = cpu_to_be16(attr->ihs + VLAN_HLEN); + eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs + VLAN_HLEN); mlx5e_insert_vlan(eseg->inline_hdr.start, skb, attr->ihs); stats->added_vlan_packets++; } else { - eseg->inline_hdr.sz = cpu_to_be16(attr->ihs); + eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs); memcpy(eseg->inline_hdr.start, skb->data, attr->ihs); } dseg += wqe_attr->ds_cnt_inl; @@ -414,6 +442,7 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, stats->added_vlan_packets++; } + dseg += wqe_attr->ds_cnt_ids; num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr->ihs, attr->headlen, dseg); if (unlikely(num_dma < 0)) @@ -430,7 +459,8 @@ err_drop: static bool mlx5e_tx_skb_supports_mpwqe(struct sk_buff *skb, struct mlx5e_tx_attr *attr) { - return !skb_is_nonlinear(skb) && !skb_vlan_tag_present(skb) && !attr->ihs; + return !skb_is_nonlinear(skb) && !skb_vlan_tag_present(skb) && !attr->ihs && + !attr->insz; } static bool mlx5e_tx_mpwqe_same_eseg(struct mlx5e_txqsq *sq, struct mlx5_wqe_eth_seg *eseg) @@ -580,7 +610,7 @@ void mlx5e_tx_mpwqe_ensure_complete(struct mlx5e_txqsq *sq) static bool mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { - if (unlikely(!mlx5e_accel_tx_eseg(priv, sq, skb, eseg))) + if (unlikely(!mlx5e_accel_tx_eseg(priv, skb, eseg))) return false; mlx5e_txwqe_build_eseg_csum(sq, skb, eseg); @@ -625,7 +655,8 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) wqe = MLX5E_TX_FETCH_WQE(sq, pi); /* May update the WQE, but may not post other WQEs. 
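
The new ds_cnt_ids accounting above reserves whole 16-byte data-segment slots for the inline IPsec trailer before the DMA segments are laid out. A standalone sketch of that rounding follows; all constants here are assumptions for illustration, not values read from the mlx5 headers.

/* Illustrative only: constants are assumed, not taken from mlx5 headers. */
#include <stdio.h>

#define SEND_WQE_DS         16  /* bytes per data-segment slot (assumed) */
#define SEND_WQEBB_NUM_DS    4  /* 64-byte basic block / 16-byte DS (assumed) */
#define INLINE_SEG_HDR       4  /* inline segment header size (assumed) */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int insz = 20;   /* hypothetical IPsec trailer length */
	unsigned int ds_cnt = 2;  /* empty WQE: ctrl + eth segments (assumed) */
	unsigned int ds_cnt_ids = DIV_ROUND_UP(INLINE_SEG_HDR + insz, SEND_WQE_DS);

	ds_cnt += ds_cnt_ids;
	printf("trailer DS slots: %u, WQEBBs: %u\n",
	       ds_cnt_ids, DIV_ROUND_UP(ds_cnt, SEND_WQEBB_NUM_DS));
	return 0;  /* prints: trailer DS slots: 2, WQEBBs: 1 */
}
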
*/ - mlx5e_accel_tx_finish(sq, wqe, &accel); + mlx5e_accel_tx_finish(sq, wqe, &accel, + (struct mlx5_wqe_inline_seg *)(wqe->data + wqe_attr.ds_cnt_inl)); if (unlikely(!mlx5e_txwqe_build_eseg(priv, sq, skb, &wqe->eth))) return NETDEV_TX_OK; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index fee169732de7..babe3405132a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -776,6 +776,9 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, table_type = FS_FT_NIC_RX; break; case MLX5_FLOW_NAMESPACE_EGRESS: +#ifdef CONFIG_MLX5_IPSEC + case MLX5_FLOW_NAMESPACE_EGRESS_KERNEL: +#endif max_actions = MLX5_CAP_FLOWTABLE_NIC_TX(dev, max_modify_header_actions); table_type = FS_FT_NIC_TX; break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 6141e9ec8190..16091838bfcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -126,6 +126,10 @@ #define LAG_NUM_PRIOS 1 #define LAG_MIN_LEVEL (OFFLOADS_MIN_LEVEL + 1) +#define KERNEL_TX_IPSEC_NUM_PRIOS 1 +#define KERNEL_TX_IPSEC_NUM_LEVELS 1 +#define KERNEL_TX_MIN_LEVEL (KERNEL_TX_IPSEC_NUM_LEVELS) + struct node_caps { size_t arr_sz; long *caps; @@ -180,13 +184,24 @@ static struct init_tree_node { static struct init_tree_node egress_root_fs = { .type = FS_TYPE_NAMESPACE, +#ifdef CONFIG_MLX5_IPSEC + .ar_size = 2, +#else .ar_size = 1, +#endif .children = (struct init_tree_node[]) { ADD_PRIO(0, MLX5_BY_PASS_NUM_PRIOS, 0, FS_CHAINING_CAPS_EGRESS, ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS, BY_PASS_PRIO_NUM_LEVELS))), +#ifdef CONFIG_MLX5_IPSEC + ADD_PRIO(0, KERNEL_TX_MIN_LEVEL, 0, + FS_CHAINING_CAPS_EGRESS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(KERNEL_TX_IPSEC_NUM_PRIOS, + KERNEL_TX_IPSEC_NUM_LEVELS))), +#endif } }; @@ -2165,8 +2180,10 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, break; } - if (type == MLX5_FLOW_NAMESPACE_EGRESS) { + if (type == MLX5_FLOW_NAMESPACE_EGRESS || + type == MLX5_FLOW_NAMESPACE_EGRESS_KERNEL) { root_ns = steering->egress_root_ns; + prio = type - MLX5_FLOW_NAMESPACE_EGRESS; } else if (type == MLX5_FLOW_NAMESPACE_RDMA_RX) { root_ns = steering->rdma_rx_root_ns; prio = RDMA_RX_BYPASS_PRIO; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index d3c03942546d..b34da11acf65 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -669,7 +669,8 @@ struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port) return priv->dev; } -static bool ocelot_port_dev_check(const struct net_device *dev) +/* Checks if the net_device instance given to us originates from our driver */ +static bool ocelot_netdevice_dev_check(const struct net_device *dev) { return dev->netdev_ops == &ocelot_port_netdev_ops; } @@ -678,7 +679,7 @@ int ocelot_netdev_to_port(struct net_device *dev) { struct ocelot_port_private *priv; - if (!dev || !ocelot_port_dev_check(dev)) + if (!dev || !ocelot_netdevice_dev_check(dev)) return -EINVAL; priv = netdev_priv(dev); @@ -907,12 +908,6 @@ static int ocelot_port_obj_del(struct net_device *dev, return ret; } -/* Checks if the net_device instance given to us originate from our driver. 
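
The ocelot change above renames ocelot_port_dev_check() to ocelot_netdevice_dev_check() and removes the duplicate definition here. Both rely on a common kernel idiom: a driver recognizes its own net_device instances by comparing the ops pointer, which is cheaper and more robust than matching names. A generic sketch of the idiom, with hypothetical names:

#include <linux/netdevice.h>
#include <linux/notifier.h>

static const struct net_device_ops foo_port_netdev_ops; /* hypothetical */

/* True iff @dev was registered by this driver. */
static bool foo_netdevice_dev_check(const struct net_device *dev)
{
	return dev->netdev_ops == &foo_port_netdev_ops;
}

/* Typical caller: a netdevice notifier that ignores foreign devices. */
static int foo_netdevice_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (!foo_netdevice_dev_check(dev))
		return NOTIFY_DONE;
	/* ... handle the event for one of our own ports ... */
	return NOTIFY_OK;
}
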
*/ -static bool ocelot_netdevice_dev_check(const struct net_device *dev) -{ - return dev->netdev_ops == &ocelot_port_netdev_ops; -} - static int ocelot_netdevice_port_event(struct net_device *dev, unsigned long event, struct netdev_notifier_changeupper_info *info) diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c index 317fad787d78..267c080ee084 100644 --- a/drivers/net/ethernet/ti/tlan.c +++ b/drivers/net/ethernet/ti/tlan.c @@ -2504,7 +2504,7 @@ static void tlan_phy_power_down(struct net_device *dev) } /* Wait for 50 ms and powerup - * This is abitrary. It is intended to make sure the + * This is arbitrary. It is intended to make sure the * transceiver settles. */ tlan_set_timer(dev, msecs_to_jiffies(50), TLAN_TIMER_PHY_PUP); diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 24d688e3cd93..b40b711cf4bd 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1432,6 +1432,9 @@ void ipa_endpoint_resume_one(struct ipa_endpoint *endpoint) void ipa_endpoint_suspend(struct ipa *ipa) { + if (!ipa->setup_complete) + return; + if (ipa->modem_netdev) ipa_modem_suspend(ipa->modem_netdev); @@ -1443,6 +1446,9 @@ void ipa_endpoint_suspend(struct ipa *ipa) void ipa_endpoint_resume(struct ipa *ipa) { + if (!ipa->setup_complete) + return; + ipa_endpoint_resume_one(ipa->name_map[IPA_ENDPOINT_AP_COMMAND_TX]); ipa_endpoint_resume_one(ipa->name_map[IPA_ENDPOINT_AP_LAN_RX]); diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 2b0c8f01ddc3..11ca5fa902a1 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3647,30 +3647,10 @@ static int macsec_change_mtu(struct net_device *dev, int new_mtu) static void macsec_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { - int cpu; - if (!dev->tstats) return; - for_each_possible_cpu(cpu) { - struct pcpu_sw_netstats *stats; - struct pcpu_sw_netstats tmp; - int start; - - stats = per_cpu_ptr(dev->tstats, cpu); - do { - start = u64_stats_fetch_begin_irq(&stats->syncp); - tmp.rx_packets = stats->rx_packets; - tmp.rx_bytes = stats->rx_bytes; - tmp.tx_packets = stats->tx_packets; - tmp.tx_bytes = stats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); - - s->rx_packets += tmp.rx_packets; - s->rx_bytes += tmp.rx_bytes; - s->tx_packets += tmp.tx_packets; - s->tx_bytes += tmp.tx_bytes; - } + dev_fetch_sw_netstats(s, dev->tstats); s->rx_dropped = dev->stats.rx_dropped; s->tx_dropped = dev->stats.tx_dropped; diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 5ca1356b8656..a322f51873d0 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -126,31 +126,9 @@ static void qmimux_get_stats64(struct net_device *net, struct rtnl_link_stats64 *stats) { struct qmimux_priv *priv = netdev_priv(net); - unsigned int start; - int cpu; netdev_stats_to_stats64(stats, &net->stats); - - for_each_possible_cpu(cpu) { - struct pcpu_sw_netstats *stats64; - u64 rx_packets, rx_bytes; - u64 tx_packets, tx_bytes; - - stats64 = per_cpu_ptr(priv->stats64, cpu); - - do { - start = u64_stats_fetch_begin_irq(&stats64->syncp); - rx_packets = stats64->rx_packets; - rx_bytes = stats64->rx_bytes; - tx_packets = stats64->tx_packets; - tx_bytes = stats64->tx_bytes; - } while (u64_stats_fetch_retry_irq(&stats64->syncp, start)); - - stats->rx_packets += rx_packets; - stats->rx_bytes += rx_bytes; - stats->tx_packets += tx_packets; - stats->tx_bytes += tx_bytes; - } + dev_fetch_sw_netstats(stats, priv->stats64); } static const struct 
net_device_ops qmimux_netdev_ops = { diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 963d260d19ab..6062dc27870e 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -983,31 +983,9 @@ EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings); void usbnet_get_stats64(struct net_device *net, struct rtnl_link_stats64 *stats) { struct usbnet *dev = netdev_priv(net); - unsigned int start; - int cpu; netdev_stats_to_stats64(stats, &net->stats); - - for_each_possible_cpu(cpu) { - struct pcpu_sw_netstats *stats64; - u64 rx_packets, rx_bytes; - u64 tx_packets, tx_bytes; - - stats64 = per_cpu_ptr(dev->stats64, cpu); - - do { - start = u64_stats_fetch_begin_irq(&stats64->syncp); - rx_packets = stats64->rx_packets; - rx_bytes = stats64->rx_bytes; - tx_packets = stats64->tx_packets; - tx_bytes = stats64->tx_bytes; - } while (u64_stats_fetch_retry_irq(&stats64->syncp, start)); - - stats->rx_packets += rx_packets; - stats->rx_bytes += rx_bytes; - stats->tx_packets += tx_packets; - stats->tx_bytes += tx_bytes; - } + dev_fetch_sw_netstats(stats, dev->stats64); } EXPORT_SYMBOL_GPL(usbnet_get_stats64); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 21b71148c532..d2d2c4a53cf2 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -68,6 +68,8 @@ static const unsigned long guest_offloads[] = { (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_NET_F_GUEST_UFO)) +#define GUEST_OFFLOAD_CSUM_MASK (1ULL << VIRTIO_NET_F_GUEST_CSUM) + struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; @@ -2522,29 +2524,48 @@ static int virtnet_get_phys_port_name(struct net_device *dev, char *buf, return 0; } +static netdev_features_t virtnet_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + /* If Rx checksum is disabled, LRO should also be disabled. */ + if (!(features & NETIF_F_RXCSUM)) + features &= ~NETIF_F_LRO; + + return features; +} + static int virtnet_set_features(struct net_device *dev, netdev_features_t features) { struct virtnet_info *vi = netdev_priv(dev); - u64 offloads; + u64 offloads = vi->guest_offloads; int err; - if ((dev->features ^ features) & NETIF_F_LRO) { - if (vi->xdp_queue_pairs) - return -EBUSY; + /* Don't allow configuration while XDP is active. 
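
The usbnet and qmi_wwan hunks above are instances of the conversion repeated throughout this series: open-coded per-cpu summation loops are replaced by the new dev_fetch_sw_netstats() core helper (added in net/core/dev.c later in this diff). A minimal driver-side sketch of the resulting pattern; foo_priv and its stats64 field are hypothetical:

#include <linux/netdevice.h>

struct foo_priv {
	struct pcpu_sw_netstats __percpu *stats64; /* hypothetical field */
};

static void foo_get_stats64(struct net_device *dev,
			    struct rtnl_link_stats64 *stats)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Base counters first: the helper only adds the per-cpu values. */
	netdev_stats_to_stats64(stats, &dev->stats);
	dev_fetch_sw_netstats(stats, priv->stats64);
}
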
*/ + if (vi->xdp_queue_pairs) + return -EBUSY; + if ((dev->features ^ features) & NETIF_F_LRO) { if (features & NETIF_F_LRO) - offloads = vi->guest_offloads_capable; + offloads |= GUEST_OFFLOAD_LRO_MASK & + vi->guest_offloads_capable; else - offloads = vi->guest_offloads_capable & - ~GUEST_OFFLOAD_LRO_MASK; + offloads &= ~GUEST_OFFLOAD_LRO_MASK; + } - err = virtnet_set_guest_offloads(vi, offloads); - if (err) - return err; - vi->guest_offloads = offloads; + if ((dev->features ^ features) & NETIF_F_RXCSUM) { + if (features & NETIF_F_RXCSUM) + offloads |= GUEST_OFFLOAD_CSUM_MASK & + vi->guest_offloads_capable; + else + offloads &= ~GUEST_OFFLOAD_CSUM_MASK; } + err = virtnet_set_guest_offloads(vi, offloads); + if (err) + return err; + + vi->guest_offloads = offloads; return 0; } @@ -2563,6 +2584,7 @@ static const struct net_device_ops virtnet_netdev = { .ndo_features_check = passthru_features_check, .ndo_get_phys_port_name = virtnet_get_phys_port_name, .ndo_set_features = virtnet_set_features, + .ndo_fix_features = virtnet_fix_features, }; static void virtnet_config_changed_work(struct work_struct *work) @@ -3013,8 +3035,10 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) dev->features |= NETIF_F_LRO; - if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) { + dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_LRO; + } dev->vlan_features = dev->features; diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c index 374074dc71c0..bf6dbeb61842 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/core.c +++ b/drivers/net/wireless/quantenna/qtnfmac/core.c @@ -139,34 +139,13 @@ static void qtnf_netdev_get_stats64(struct net_device *ndev, struct rtnl_link_stats64 *stats) { struct qtnf_vif *vif = qtnf_netdev_get_priv(ndev); - unsigned int start; - int cpu; netdev_stats_to_stats64(stats, &ndev->stats); if (!vif->stats64) return; - for_each_possible_cpu(cpu) { - struct pcpu_sw_netstats *stats64; - u64 rx_packets, rx_bytes; - u64 tx_packets, tx_bytes; - - stats64 = per_cpu_ptr(vif->stats64, cpu); - - do { - start = u64_stats_fetch_begin_irq(&stats64->syncp); - rx_packets = stats64->rx_packets; - rx_bytes = stats64->rx_bytes; - tx_packets = stats64->tx_packets; - tx_bytes = stats64->tx_bytes; - } while (u64_stats_fetch_retry_irq(&stats64->syncp, start)); - - stats->rx_packets += rx_packets; - stats->rx_bytes += rx_bytes; - stats->tx_packets += tx_packets; - stats->tx_bytes += tx_bytes; - } + dev_fetch_sw_netstats(stats, vif->stats64); } /* Netdev handler for transmission timeout. 
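
In the virtio_net changes above, virtnet_set_features() now folds both the LRO and the new RXCSUM toggles into one guest-offload word before issuing a single control command, and virtnet_fix_features() keeps the two features consistent. The mask handling reduces to ordinary bit manipulation; a userspace sketch with placeholder mask values (not the VIRTIO bit positions):

#include <stdint.h>
#include <stdio.h>

#define OFFLOAD_LRO_MASK  0x0eULL /* placeholder for GUEST_OFFLOAD_LRO_MASK */
#define OFFLOAD_CSUM_MASK 0x01ULL /* placeholder for GUEST_OFFLOAD_CSUM_MASK */

static uint64_t apply_features(uint64_t offloads, uint64_t capable,
			       int want_lro, int want_rxcsum)
{
	if (want_lro)
		offloads |= OFFLOAD_LRO_MASK & capable;
	else
		offloads &= ~OFFLOAD_LRO_MASK;

	if (want_rxcsum)
		offloads |= OFFLOAD_CSUM_MASK & capable;
	else
		offloads &= ~OFFLOAD_CSUM_MASK;

	return offloads;
}

int main(void)
{
	uint64_t capable = OFFLOAD_LRO_MASK | OFFLOAD_CSUM_MASK;

	/* RXCSUM off implies LRO off; the ndo_fix_features hook enforces it. */
	printf("%#llx\n", (unsigned long long)
	       apply_features(capable, capable, 0, 0));
	return 0;
}
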
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 92d991d93757..846d94ad04bc 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -76,6 +76,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, MLX5_FLOW_NAMESPACE_EGRESS, + MLX5_FLOW_NAMESPACE_EGRESS_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 36492a1342cf..d75ef8aa8fac 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -245,6 +245,10 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; +enum { + MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), +}; + struct mlx5_wqe_eth_seg { u8 swp_outer_l4_offset; u8 swp_outer_l3_offset; @@ -253,7 +257,7 @@ struct mlx5_wqe_eth_seg { u8 cs_flags; u8 swp_flags; __be16 mss; - __be32 rsvd2; + __be32 flow_table_metadata; union { struct { __be16 sz; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 948d7105e91a..964b494b0e8d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4499,6 +4499,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage); void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats); +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 0d3920896d50..716db4a0fed8 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -108,6 +108,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, unsigned int logflags); void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb); void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3965ce18226f..3f7e56b1171e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -14,6 +14,8 @@ #include <net/netlink.h> #include <net/flow_offload.h> +#define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) + struct module; #define NFT_JUMP_STACK_SIZE 16 @@ -979,7 +981,7 @@ struct nft_chain_type { int family; struct module *owner; unsigned int hook_mask; - nf_hookfn *hooks[NF_MAX_HOOKS]; + nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index e1057b255f69..879fe8cff581 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -56,7 +56,10 @@ static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); - struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + struct tcf_tunnel_key_params *params; + + params = rcu_dereference_protected(t->params, + lockdep_is_held(&a->tcfa_lock)); return ¶ms->tcft_enc_metadata->u.tun_info; #else diff --git a/include/net/tls.h b/include/net/tls.h index e5dac7e74e79..baf1e99d8193 100644 --- 
a/include/net/tls.h +++ b/include/net/tls.h @@ -679,10 +679,6 @@ int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); -struct sk_buff *tls_validate_xmit_skb(struct sock *sk, - struct net_device *dev, - struct sk_buff *skb); - int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 6a6179af0d7c..ef9a44286e23 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -45,8 +45,8 @@ enum nf_inet_hooks { NF_INET_FORWARD, NF_INET_LOCAL_OUT, NF_INET_POST_ROUTING, - NF_INET_INGRESS, - NF_INET_NUMHOOKS + NF_INET_NUMHOOKS, + NF_INET_INGRESS = NF_INET_NUMHOOKS, }; enum nf_dev_hooks { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 352ee51707a1..98272cb5f617 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -749,10 +749,12 @@ enum nft_payload_bases { * * @NFT_PAYLOAD_CSUM_NONE: no checksumming * @NFT_PAYLOAD_CSUM_INET: internet checksum (RFC 791) + * @NFT_PAYLOAD_CSUM_SCTP: CRC-32c, for use in SCTP header (RFC 3309) */ enum nft_payload_csum_types { NFT_PAYLOAD_CSUM_NONE, NFT_PAYLOAD_CSUM_INET, + NFT_PAYLOAD_CSUM_SCTP, }; enum nft_payload_csum_flags { diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 9a2fb4aa1a10..6f742fee874a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -206,27 +206,8 @@ static void br_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct net_bridge *br = netdev_priv(dev); - struct pcpu_sw_netstats tmp, sum = { 0 }; - unsigned int cpu; - - for_each_possible_cpu(cpu) { - unsigned int start; - const struct pcpu_sw_netstats *bstats - = per_cpu_ptr(br->stats, cpu); - do { - start = u64_stats_fetch_begin_irq(&bstats->syncp); - memcpy(&tmp, bstats, sizeof(tmp)); - } while (u64_stats_fetch_retry_irq(&bstats->syncp, start)); - sum.tx_bytes += tmp.tx_bytes; - sum.tx_packets += tmp.tx_packets; - sum.rx_bytes += tmp.rx_bytes; - sum.rx_packets += tmp.rx_packets; - } - stats->tx_bytes = sum.tx_bytes; - stats->tx_packets = sum.tx_packets; - stats->rx_bytes = sum.rx_bytes; - stats->rx_packets = sum.rx_packets; + dev_fetch_sw_netstats(stats, br->stats); } static int br_change_mtu(struct net_device *dev, int new_mtu) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 0cec4152f979..e09d087ba240 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -580,6 +580,7 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); @@ -1487,6 +1488,7 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; skcb = j1939_skb_to_cb(skb); memcpy(skcb, rel_skcb, sizeof(*skcb)); diff --git a/net/core/dev.c b/net/core/dev.c index 22babf5bcc88..751e5264fd49 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10328,6 +10328,40 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } EXPORT_SYMBOL(dev_get_stats); +/** + * dev_fetch_sw_netstats - get per-cpu network device statistics + * @s: place to store stats + * 
@netstats: per-cpu network stats to read from + * + * Read per-cpu network statistics and add them to the related fields in @s. + */ +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + const struct pcpu_sw_netstats *stats; + struct pcpu_sw_netstats tmp; + unsigned int start; + + stats = per_cpu_ptr(netstats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + tmp.rx_packets = stats->rx_packets; + tmp.rx_bytes = stats->rx_bytes; + tmp.tx_packets = stats->tx_packets; + tmp.tx_bytes = stats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + s->rx_packets += tmp.rx_packets; + s->rx_bytes += tmp.rx_bytes; + s->tx_packets += tmp.tx_packets; + s->tx_bytes += tmp.tx_bytes; + } +} +EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); diff --git a/net/core/sock.c b/net/core/sock.c index d9a537e6876a..4e8729357122 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -757,7 +757,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); } } @@ -994,8 +993,6 @@ set_sndbuf: __sock_set_timestamps(sk, valbool, true, true); break; case SO_TIMESTAMPING_NEW: - sock_set_flag(sk, SOCK_TSTAMP_NEW); - fallthrough; case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; @@ -1024,16 +1021,14 @@ set_sndbuf: } sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); - else { - if (optname == SO_TIMESTAMPING_NEW) - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - + else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - } break; case SO_RCVLOWAT: diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e7c1d62fde99..3bc5ca40c9fb 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1221,28 +1221,9 @@ static void dsa_slave_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct dsa_slave_priv *p = netdev_priv(dev); - struct pcpu_sw_netstats *s; - unsigned int start; - int i; netdev_stats_to_stats64(stats, &dev->stats); - for_each_possible_cpu(i) { - u64 tx_packets, tx_bytes, rx_packets, rx_bytes; - - s = per_cpu_ptr(p->stats64, i); - do { - start = u64_stats_fetch_begin_irq(&s->syncp); - tx_packets = s->tx_packets; - tx_bytes = s->tx_bytes; - rx_packets = s->rx_packets; - rx_bytes = s->rx_bytes; - } while (u64_stats_fetch_retry_irq(&s->syncp, start)); - - stats->tx_packets += tx_packets; - stats->tx_bytes += tx_bytes; - stats->rx_packets += rx_packets; - stats->rx_bytes += rx_bytes; - } + dev_fetch_sw_netstats(stats, p->stats64); } static int dsa_slave_get_rxnfc(struct net_device *dev, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8f2e974a1e4d..e8d704de728c 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -457,6 +457,23 @@ out_bh_enable: local_bh_enable(); } +/* + * The device used for looking up which routing table to use for sending an + * ICMP error is preferably the source device, when one is set, so that the + * ICMP error can be routed back to the source host; otherwise fall back to + * the destination device's routing table, and finally to the main routing table (index 0). 
+ */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *route_lookup_dev = NULL; + + if (skb->dev) + route_lookup_dev = skb->dev; + else if (skb_dst(skb)) + route_lookup_dev = skb_dst(skb)->dev; + return route_lookup_dev; +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -465,6 +482,7 @@ static struct rtable *icmp_route_lookup(struct net *net, int type, int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -479,7 +497,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); @@ -503,7 +522,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, + if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4e31f23e4117..e70291748889 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -625,9 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { - /* Need space for new headers */ - if (skb_cow_head(skb, dev->needed_headroom - - (tunnel->hlen + sizeof(struct iphdr)))) + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; @@ -748,7 +746,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) len = tunnel->tun_hlen - len; tunnel->hlen = tunnel->hlen + len; - dev->needed_headroom = dev->needed_headroom + len; + if (dev->header_ops) + dev->hard_header_len += len; + else + dev->needed_headroom += len; + if (set_mtu) dev->mtu = max_t(int, dev->mtu - len, 68); @@ -944,6 +946,7 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -987,10 +990,14 @@ static int ipgre_tunnel_init(struct net_device *dev) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } #endif } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } return ip_tunnel_init(dev); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index b2ea1a8c5fd6..25f1caf5abf9 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -433,29 +433,8 @@ EXPORT_SYMBOL(skb_tunnel_check_pmtu); void ip_tunnel_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) { - int i; - netdev_stats_to_stats64(tot, &dev->stats); - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *tstats = - per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_irq(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = 
tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } + dev_fetch_sw_netstats(tot, dev->tstats); } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 7a83f881efa9..136030ad2e54 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -43,16 +43,31 @@ static void dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int nhoff) { - const struct arphdr *ah; - struct arphdr _arph; const struct arppayload *ap; struct arppayload _arpp; + const struct arphdr *ah; + unsigned int logflags; + struct arphdr _arph; ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) { nf_log_buf_add(m, "TRUNCATED"); return; } + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + if (logflags & NF_LOG_MACDECODE) { + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 0c72156130b6..d07583fac8f8 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -284,8 +284,10 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); return; default: diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d15a78b26dfa..dc2a399cd9f4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2770,10 +2770,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (IS_ERR(rt)) return rt; - if (flp4->flowi4_proto) + if (flp4->flowi4_proto) { + flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); + } return rt; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c15ca207dd85..67f10d3ec240 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4430,7 +4430,8 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) sp[i] = sp[i + 1]; continue; } - this_sack++, swalk++; + this_sack++; + swalk++; } } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 3f51e781562a..c8003c8aad2c 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -293,10 +293,10 @@ size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - info->vegas.tcpv_enabled = ca->doing_vegas_now, - info->vegas.tcpv_rttcnt = ca->cntRTT, - info->vegas.tcpv_rtt = ca->baseRTT, - info->vegas.tcpv_minrtt = ca->minRTT, + info->vegas.tcpv_enabled = ca->doing_vegas_now; + info->vegas.tcpv_rttcnt = ca->cntRTT; + info->vegas.tcpv_rtt = ca->baseRTT; + info->vegas.tcpv_minrtt = ca->minRTT; *attr = 
INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 8d3f66c310db..78f766019b7e 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -761,7 +761,7 @@ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, calipso[1] = len - 2; *(__be32 *)(calipso + 2) = htonl(doi_def->doi); calipso[6] = (len - CALIPSO_HDR_LEN) / 4; - calipso[7] = secattr->attr.mls.lvl, + calipso[7] = secattr->attr.mls.lvl; crc = ~crc_ccitt(0xffff, calipso, len); calipso[8] = crc & 0xff; calipso[9] = (crc >> 8) & 0xff; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a4e4912ad607..91209a2760aa 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -501,8 +501,11 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (__ipv6_addr_needs_scope_id(addr_type)) { iif = icmp6_iif(skb); } else { - dst = skb_dst(skb); - iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); + /* + * The source device is used for looking up which routing table + * to use for sending an ICMP error. + */ + iif = l3mdev_master_ifindex(skb->dev); } /* diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 141c0a4c569a..605cdd38a919 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2622,8 +2622,10 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) iter->skip = *pos; if (iter->tbl) { + loff_t p = 0; + ipv6_route_seq_setup_walk(iter, net); - return ipv6_route_seq_next(seq, NULL, pos); + return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a2a65e327f49..5941bfbc7c47 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -468,8 +468,6 @@ int ip6_forward(struct sk_buff *skb) * check and decrement ttl */ if (hdr->hop_limit <= 1) { - /* Force OUTPUT device used as source address */ - skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index da64550a5707..8210ff34ed9b 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -297,9 +297,11 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); return; default: break; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bde6d48a9942..7e0ce7af8234 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2745,7 +2745,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (confirm_neigh) dst_confirm_neigh(dst, daddr); - mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu < IPV6_MIN_MTU) + return; if (mtu >= dst_mtu(dst)) return; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 240862a74a0f..1be775979132 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -709,28 +709,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev, static void ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { - int i; - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *tstats; - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int 
start; - - tstats = per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); - - stats->rx_packets += rx_packets; - stats->tx_packets += tx_packets; - stats->rx_bytes += rx_bytes; - stats->tx_bytes += tx_bytes; - } + dev_fetch_sw_netstats(stats, dev->tstats); } static const struct net_device_ops ieee80211_dataif_ops = { diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4d35fa998fcc..092a2d48bfd3 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -517,7 +517,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } - if (subflow->use_64bit_ack) { + if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq); opts->ext_copy.ack64 = 1; @@ -657,6 +657,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(mptcp_check_fallback(sk))) return false; + /* prevent adding any MPTCP-related options on reset packets + * until we support MP_TCPRST/MP_FASTCLOSE + */ + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + return false; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -711,7 +717,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return false; } -static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, +static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) @@ -728,15 +734,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && mp_opt->mp_join && READ_ONCE(msk->pm.server_side)) - tcp_send_ack(sk); + tcp_send_ack(ssk); goto fully_established; } - /* we should process OoO packets before the first subflow is fully - * established, but not expected for MP_JOIN subflows + /* we must process OoO packets before the first subflow is fully + * established. OoO packets are instead a protocol violation + * for MP_JOIN subflows as the peer must not send any data + * before receiving the fourth ack - cf. RFC 8684 section 3.2. */ - if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) + if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { + if (subflow->mp_join) + goto reset; return subflow->mp_capable; + } if (mp_opt->dss && mp_opt->use_ack) { /* subflows are fully established as soon as we get any @@ -748,9 +759,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, } /* If the first established packet does not contain MP_CAPABLE + data - * then fallback to TCP + * then fallback to TCP. Fallback scenarios require a reset for + * MP_JOIN subflows. 
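
Taken together, the rules in check_fully_established() above distinguish initial subflows, which may silently fall back to plain TCP, from MP_JOIN subflows, which must be reset on any violation. A compact, illustrative restatement of that decision; names mirror the kernel code, but this is a sketch, not a drop-in implementation:

#include <stdbool.h>
#include <stdio.h>

/* Returns true when option processing may continue; false means the
 * subflow must be reset.
 */
static bool policy_ok(bool out_of_order, bool mp_join, bool mp_capable)
{
	if (out_of_order)            /* data before the expected sequence */
		return mp_join ? false : mp_capable;
	if (!mp_capable && mp_join)  /* no TCP fallback for MP_JOIN */
		return false;
	return true;                 /* proceed, or fall back for MP_CAPABLE */
}

int main(void)
{
	/* MP_JOIN subflow with out-of-order data: must be reset. */
	printf("%d\n", policy_ok(true, true, false)); /* prints 0 */
	return 0;
}
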
*/ if (!mp_opt->mp_capable) { + if (subflow->mp_join) + goto reset; subflow->mp_capable = 0; pr_fallback(msk); __mptcp_do_fallback(msk); @@ -767,12 +781,16 @@ fully_established: subflow->pm_notified = 1; if (subflow->mp_join) { - clear_3rdack_retransmission(sk); + clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk, subflow); } else { mptcp_pm_fully_established(msk); } return true; + +reset: + mptcp_subflow_reset(ssk); + return false; } static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9816608da452..185dacb39781 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1710,6 +1710,20 @@ static void pm_work(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } +static void __mptcp_close_subflow(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (inet_sk_state_load(ssk) != TCP_CLOSE) + continue; + + __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + } +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1727,6 +1741,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_clean_una(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(msk); + __mptcp_move_skbs(msk); if (msk->pm.status) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index aa0ab18d2e57..13ab89dc1914 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -90,6 +90,7 @@ #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 +#define MPTCP_WORK_CLOSE_SUBFLOW 5 struct mptcp_options_received { u64 sndr_key; @@ -211,6 +212,7 @@ struct mptcp_sock { bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; struct work_struct work; struct sk_buff *ooo_last_skb; @@ -310,7 +312,6 @@ struct mptcp_subflow_context { mpc_map : 1, backup : 1, rx_eof : 1, - use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ enum mptcp_data_avail data_avail; u32 remote_nonce; @@ -369,6 +370,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, long timeout); +void mptcp_subflow_reset(struct sock *ssk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5d91e3a2cd30..ac4a1fe3550b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -271,6 +271,19 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) return thmac == subflow->thmac; } +void mptcp_subflow_reset(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + + tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_done(ssk); + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && + schedule_work(&mptcp_sk(sk)->work)) + sock_hold(sk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ 
-343,8 +356,7 @@ fallback: return; do_reset: - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + mptcp_subflow_reset(sk); } struct request_sock_ops mptcp_subflow_request_sock_ops; @@ -770,12 +782,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (!mpext->dsn64) { map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, mpext->data_seq); - subflow->use_64bit_ack = 0; pr_debug("expanded seq=%llu", subflow->map_seq); } else { map_seq = mpext->data_seq; - subflow->use_64bit_ack = 1; } + WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 25313c29d799..52370211e46b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -441,6 +441,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK + select LIBCRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b00866d777fe..d2e5a8f644b8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -609,6 +609,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset_ct(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; } return ret; } @@ -649,6 +651,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -669,6 +673,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index ae5628ddbe6d..fd7c5f0f5c25 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -171,6 +171,18 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, } EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) +{ + u16 vid; + + if (!skb_vlan_tag_present(skb)) + return; + + vid = skb_vlan_tag_get(skb); + nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); +} +EXPORT_SYMBOL_GPL(nf_log_dump_vlan); + /* bridge and netdev logging families share this code. 
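
The new nf_log_dump_vlan() helper above centralizes the VLAN tag dump that the ARP, IPv4 and IPv6 loggers gain earlier in this diff. Callers emit it between the MAC addresses and the protocol field; a sketch of such a caller follows (the surrounding function is hypothetical, the helper calls match the diff):

#include <linux/if_ether.h>
#include <net/netfilter/nf_log.h>

static void foo_dump_mac_header(struct nf_log_buf *m,
				const struct sk_buff *skb)
{
	nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
	/* Emits "VPROTO=%04x VID=%u " only when the skb carries a tag. */
	nf_log_dump_vlan(m, skb);
	nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto));
}
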
*/ void nf_log_l2packet(struct net *net, u_int8_t pf, __be16 protocol, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f22ad21d0230..9957e0ed8658 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1864,7 +1864,7 @@ static int nft_chain_parse_hook(struct net *net, if (IS_ERR(type)) return PTR_ERR(type); } - if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) + if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && @@ -2138,7 +2138,8 @@ static bool nft_hook_list_equal(struct list_head *hook_list1, } static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, - u32 flags) + u32 flags, const struct nlattr *attr, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -2154,9 +2155,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EOPNOTSUPP; if (nla[NFTA_CHAIN_HOOK]) { - if (!nft_is_base_chain(chain)) + if (!nft_is_base_chain(chain)) { + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; - + } err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, false); if (err < 0) @@ -2165,6 +2167,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, basechain = nft_base_chain(chain); if (basechain->type != hook.type) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } @@ -2172,6 +2175,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (!nft_hook_list_equal(&basechain->hook_list, &hook.list)) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } else { @@ -2179,6 +2183,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } @@ -2191,8 +2196,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nft_chain_lookup(ctx->net, table, nla[NFTA_CHAIN_NAME], genmask); - if (!IS_ERR(chain2)) + if (!IS_ERR(chain2)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return -EEXIST; + } } if (nla[NFTA_CHAIN_COUNTERS]) { @@ -2235,6 +2242,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); kfree(name); goto err; } @@ -2357,7 +2365,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; flags |= chain->flags & NFT_CHAIN_BASE; - return nf_tables_updchain(&ctx, genmask, policy, flags); + return nf_tables_updchain(&ctx, genmask, policy, flags, attr, + extack); } return nf_tables_addchain(&ctx, family, genmask, policy, flags); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7a2e59638499..dcd3c7b8a367 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -22,6 +22,7 @@ #include <linux/icmpv6.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <net/sctp/checksum.h> static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, struct vlan_ethhdr *veth) @@ -484,6 +485,19 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return 0; } +static int nft_payload_csum_sctp(struct sk_buff *skb, int offset) +{ + struct sctphdr *sh; 
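+	/* SCTP uses CRC-32c (RFC 3309) rather than the Internet checksum,
+	 * so the generic l4csum update path cannot patch it incrementally:
+	 * make the header writable, recompute over the whole transport
+	 * payload and store the result back.
+	 */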
+ + if (skb_ensure_writable(skb, offset + sizeof(*sh))) + return -1; + + sh = (struct sctphdr *)(skb->data + offset); + sh->checksum = sctp_compute_cksum(skb, offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + return 0; +} + static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, struct sk_buff *skb, __wsum fsum, __wsum tsum) @@ -587,6 +601,13 @@ static void nft_payload_set_eval(const struct nft_expr *expr, skb_store_bits(skb, offset, src, priv->len) < 0) goto err; + if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && + pkt->tprot == IPPROTO_SCTP && + skb->ip_summed != CHECKSUM_PARTIAL) { + if (nft_payload_csum_sctp(skb, pkt->xt.thoff)) + goto err; + } + return; err: regs->verdict.code = NFT_BREAK; @@ -623,6 +644,13 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; + case NFT_PAYLOAD_CSUM_SCTP: + if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER) + return -EINVAL; + + if (priv->csum_offset != offsetof(struct sctphdr, checksum)) + return -EINVAL; + break; default: return -EOPNOTSUPP; } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d8fe66eea206..1e30d8df3ba5 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -86,31 +86,13 @@ static void internal_dev_destructor(struct net_device *dev) static void internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { - int i; - memset(stats, 0, sizeof(*stats)); stats->rx_errors = dev->stats.rx_errors; stats->tx_errors = dev->stats.tx_errors; stats->tx_dropped = dev->stats.tx_dropped; stats->rx_dropped = dev->stats.rx_dropped; - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *percpu_stats; - struct pcpu_sw_netstats local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; - } + dev_fetch_sw_netstats(stats, dev->tstats); } static const struct net_device_ops internal_dev_netdev_ops = { diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c9287b6551df..dce48162f6c2 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -831,6 +831,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *); * conn_event.c */ void rxrpc_process_connection(struct work_struct *); +void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); /* * conn_object.c diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 78c845a4f1ad..7e574c75be8e 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -901,11 +901,14 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) struct rxrpc_bundle *bundle = conn->bundle; struct rxrpc_local *local = bundle->params.local; unsigned int bindex; - bool need_drop = false; + bool need_drop = false, need_put = false; int i; _enter("C=%x", conn->debug_id); + if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) + rxrpc_process_delayed_final_acks(conn, true); + spin_lock(&bundle->channel_lock); bindex = conn->bundle_shift / RXRPC_MAXCALLS; if (bundle->conns[bindex] == conn) { @@ -928,10 +931,11 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (i == ARRAY_SIZE(bundle->conns) && 
!bundle->params.exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); + need_put = true; } spin_unlock(&local->client_bundles_lock); - if (i == ARRAY_SIZE(bundle->conns)) + if (need_put) rxrpc_put_bundle(bundle); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 6b7c6f4a82e3..aff184145ffa 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -397,7 +397,7 @@ abort: /* * Process delayed final ACKs that we haven't subsumed into a subsequent call. */ -static void rxrpc_process_delayed_final_acks(struct rxrpc_connection *conn) +void rxrpc_process_delayed_final_acks(struct rxrpc_connection *conn, bool force) { unsigned long j = jiffies, next_j; unsigned int channel; @@ -416,7 +416,7 @@ again: smp_rmb(); /* vs rxrpc_disconnect_client_call */ ack_at = READ_ONCE(chan->final_ack_at); - if (time_before(j, ack_at)) { + if (time_before(j, ack_at) && !force) { if (time_before(ack_at, next_j)) { next_j = ack_at; set = true; @@ -450,7 +450,7 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn) /* Process delayed ACKs whose time has come. */ if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) - rxrpc_process_delayed_final_acks(conn); + rxrpc_process_delayed_final_acks(conn, false); /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f1dbb5025c0b..d790c43c473f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1596,7 +1596,7 @@ out: return rc; } -#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) @@ -1615,7 +1615,8 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : ERR_PTR(rc); + return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : + ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. 
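
The SMCD_DMBE_SIZES change above caps the compressed DMB size index at 6: each index doubles a 16KB base, so index 6 is the 1MB maximum named in the comment, while 7 would presumably permit a 2MB buffer beyond that bound. A quick standalone check of the mapping:

#include <stdio.h>

#define SMCD_DMBE_SIZES 6 /* largest valid compressed index, per the fix */

int main(void)
{
	for (int idx = 0; idx <= SMCD_DMBE_SIZES; idx++)
		printf("index %d -> %u KB\n", idx, 16u << idx);
	return 0; /* ends at: index 6 -> 1024 KB, the 1MB bound */
}
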
So, pretend it was smaller */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2db967f2fb50..273eaf1bfe49 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -233,8 +233,6 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow, default: flow->type = SMC_LLC_FLOW_NONE; } - if (qentry == lgr->delayed_event) - lgr->delayed_event = NULL; smc_llc_flow_qentry_set(flow, qentry); spin_unlock_bh(&lgr->llc_flow_lock); return true; @@ -1209,7 +1207,7 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) /* enqueue a local add_link req to trigger a new add_link flow */ void smc_llc_add_link_local(struct smc_link *link) { - struct smc_llc_msg_add_link add_llc = {0}; + struct smc_llc_msg_add_link add_llc = {}; add_llc.hd.length = sizeof(add_llc); add_llc.hd.common.type = SMC_LLC_ADD_LINK; @@ -1242,7 +1240,7 @@ out: */ void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { - struct smc_llc_msg_del_link del_llc = {0}; + struct smc_llc_msg_del_link del_llc = {}; del_llc.hd.length = sizeof(del_llc); del_llc.hd.common.type = SMC_LLC_DELETE_LINK; @@ -1314,7 +1312,7 @@ out: */ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) { - struct smc_llc_msg_del_link delllc = {0}; + struct smc_llc_msg_del_link delllc = {}; int i; delllc.hd.common.type = SMC_LLC_DELETE_LINK; @@ -1603,13 +1601,12 @@ static void smc_llc_event_work(struct work_struct *work) struct smc_llc_qentry *qentry; if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { - if (smc_link_usable(lgr->delayed_event->link)) { - smc_llc_event_handler(lgr->delayed_event); - } else { - qentry = lgr->delayed_event; - lgr->delayed_event = NULL; + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + if (smc_link_usable(qentry->link)) + smc_llc_event_handler(qentry); + else kfree(qentry); - } } again: diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 11a429f4c6cd..2a78aa701572 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -150,7 +150,8 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - frag = skb_unshare(frag, GFP_ATOMIC); + if (skb_cloned(frag)) + frag = skb_copy(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; head = *headbuf = frag; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 2f9c148f17e2..fe4edce459ad 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -327,8 +327,13 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, struct tipc_msg *hdr; u16 seqno; + spin_lock_bh(&namedq->lock); skb_queue_walk_safe(namedq, skb, tmp) { - skb_linearize(skb); + if (unlikely(skb_linearize(skb))) { + __skb_unlink(skb, namedq); + kfree_skb(skb); + continue; + } hdr = buf_msg(skb); seqno = msg_named_seqno(hdr); if (msg_is_last_bulk(hdr)) { @@ -338,12 +343,14 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { __skb_unlink(skb, namedq); + spin_unlock_bh(&namedq->lock); return skb; } if (*open && (*rcv_nxt == seqno)) { (*rcv_nxt)++; __skb_unlink(skb, namedq); + spin_unlock_bh(&namedq->lock); return skb; } @@ -353,6 +360,7 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, continue; } } + spin_unlock_bh(&namedq->lock); return NULL; } diff --git a/net/tipc/node.c b/net/tipc/node.c index cf4b239fc569..d269ebe382e1 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1496,7 +1496,7 @@ static void node_lost_contact(struct tipc_node *n, /* Clean up broadcast 
state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); - __skb_queue_purge(&n->bc_entry.namedq); + skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index b74e2741f74f..cec86229a6a0 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -418,14 +418,14 @@ static int tls_push_data(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; - int copy, rc = 0; + bool more = false; bool done = false; + int copy, rc = 0; long timeo; if (flags & @@ -492,9 +492,8 @@ handle_error: if (!size) { last_record: tls_push_record_flags = flags; - if (more) { - tls_ctx->pending_open_record_frags = - !!record->num_frags; + if (flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE)) { + more = true; break; } @@ -526,6 +525,8 @@ last_record: } } while (!done); + tls_ctx->pending_open_record_frags = more; + if (orig_size - size > 0) rc = orig_size - size; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 002b0859fed5..8d93cea99f2c 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -869,7 +869,7 @@ static int __init tls_register(void) tls_sw_proto_ops = inet_stream_ops; tls_sw_proto_ops.splice_read = tls_sw_splice_read; - tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked, + tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked; tls_device_init(); tcp_register_ulp(&tcp_tls_ulp_ops); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb82bdc6cf7c..41c3303c3357 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -613,7 +613,6 @@ static int unix_listen(struct socket *sock, int backlog) int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - struct pid *old_pid = NULL; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) @@ -634,7 +633,6 @@ static int unix_listen(struct socket *sock, int backlog) out_unlock: unix_state_unlock(sk); - put_pid(old_pid); out: return err; } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 5b120936d82f..aa4cdcf69d47 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -541,27 +541,7 @@ static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) static void xfrmi_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { - int cpu; - - for_each_possible_cpu(cpu) { - struct pcpu_sw_netstats *stats; - struct pcpu_sw_netstats tmp; - int start; - - stats = per_cpu_ptr(dev->tstats, cpu); - do { - start = u64_stats_fetch_begin_irq(&stats->syncp); - tmp.rx_packets = stats->rx_packets; - tmp.rx_bytes = stats->rx_bytes; - tmp.tx_packets = stats->tx_packets; - tmp.tx_bytes = stats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); - - s->rx_packets += tmp.rx_packets; - s->rx_bytes += tmp.rx_bytes; - s->tx_packets += tmp.tx_packets; - s->tx_bytes += tmp.tx_bytes; - } + dev_fetch_sw_netstats(s, dev->tstats); s->rx_dropped = dev->stats.rx_dropped; s->tx_dropped = dev->stats.tx_dropped; diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 4773ce72edbd..ef352477cac6 100644 --- a/tools/testing/selftests/net/Makefile +++ 
b/tools/testing/selftests/net/Makefile @@ -20,6 +20,7 @@ TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS += rxtimestamp.sh TEST_PROGS += devlink_port_split.py TEST_PROGS += drop_monitor_tests.sh +TEST_PROGS += vrf_route_leaking.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh new file mode 100755 index 000000000000..23cf924754a5 --- /dev/null +++ b/tools/testing/selftests/net/vrf_route_leaking.sh @@ -0,0 +1,626 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2019 David Ahern <dsahern@gmail.com>. All rights reserved. +# Copyright (c) 2020 Michael Jeanson <mjeanson@efficios.com>. All rights reserved. +# +# Requires CONFIG_NET_VRF, CONFIG_VETH, CONFIG_BRIDGE and CONFIG_NET_NS. +# +# +# Symmetric routing topology +# +# blue red +# +----+ .253 +----+ .253 +----+ +# | h1 |-------------------| r1 |-------------------| h2 | +# +----+ .1 +----+ .2 +----+ +# 172.16.1/24 172.16.2/24 +# 2001:db8:16:1/64 2001:db8:16:2/64 +# +# +# Route from h1 to h2 and back goes through r1; incoming vrf blue has a route +# to the outgoing vrf red for the n2 network, and red has a route back to n1. +# The red VRF interface has an MTU of 1400. +# +# The first test sends a ping with a ttl of 1 from h1 to h2 and parses the +# output of the command to check that a ttl expired error is received. +# +# The second test runs traceroute from h1 to h2 and parses the output to check +# for a hop on r1. +# +# The third test sends a ping with a packet size of 1450 from h1 to h2 and +# parses the output of the command to check that a fragmentation error is +# received. +# +# +# Asymmetric routing topology +# +# This topology represents a customer setup where the issue with ICMP errors +# and VRF route leaking was initially reported. The MTU test isn't done here +# because of the lack of a return route in the red VRF. +# +# blue red +# .253 +----+ .253 +# +----| r1 |----+ +# | +----+ | +# +----+ | | +----+ +# | h1 |--------------+ +--------------| h2 | +# +----+ .1 | | .2 +----+ +# 172.16.1/24 | +----+ | 172.16.2/24 +# 2001:db8:16:1/64 +----| r2 |----+ 2001:db8:16:2/64 +# .254 +----+ .254 +# +# +# Route from h1 to h2 goes through r1; incoming vrf blue has a route to the +# outgoing vrf red for the n2 network, but red doesn't have a route back to n1. +# Route from h2 to h1 goes through r2. +# +# The objective is to check that the incoming vrf routing table is selected +# to send an ICMP error back to the source when the ttl of a packet reaches 1 +# while it is forwarded between different vrfs.
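+#
+# For reference, the blue<->red leak that setup_sym() configures on r1 below
+# boils down to a handful of ip(8) commands. This is only a hand-run sketch,
+# not part of the test; the names and table ids are the ones this script uses:
+#
+#   ip link add blue type vrf table 1101
+#   ip link add red type vrf table 1102
+#   ip link set eth0 vrf blue up
+#   ip link set eth1 vrf red up
+#   ip route add vrf blue 172.16.2.0/24 dev red   # leak n2 into blue
+#   ip route add vrf red 172.16.1.0/24 dev blue   # leak n1 into red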
+ +VERBOSE=0 +PAUSE_ON_FAIL=no +DEFAULT_TTYPE=sym + +H1_N1=172.16.1.0/24 +H1_N1_6=2001:db8:16:1::/64 + +H1_N1_IP=172.16.1.1 +R1_N1_IP=172.16.1.253 +R2_N1_IP=172.16.1.254 + +H1_N1_IP6=2001:db8:16:1::1 +R1_N1_IP6=2001:db8:16:1::253 +R2_N1_IP6=2001:db8:16:1::254 + +H2_N2=172.16.2.0/24 +H2_N2_6=2001:db8:16:2::/64 + +H2_N2_IP=172.16.2.2 +R1_N2_IP=172.16.2.253 +R2_N2_IP=172.16.2.254 + +H2_N2_IP6=2001:db8:16:2::2 +R1_N2_IP6=2001:db8:16:2::253 +R2_N2_IP6=2001:db8:16:2::254 + +################################################################################ +# helpers + +log_section() +{ + echo + echo "###########################################################################" + echo "$*" + echo "###########################################################################" + echo +} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ "${rc}" -eq "${expected}" ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read -r a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + # shellcheck disable=SC2086 + out=$(eval $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then + echo "$out" + fi + + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +run_cmd_grep() +{ + local grep_pattern="$1" + shift + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + # shellcheck disable=SC2086 + out=$(eval $cmd 2>&1) + if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then + echo "$out" + fi + + echo "$out" | grep -q "$grep_pattern" + rc=$? 
+ + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +################################################################################ +# setup and teardown + +cleanup() +{ + local ns + + for ns in h1 h2 r1 r2; do + ip netns del $ns 2>/dev/null + done +} + +setup_vrf() +{ + local ns=$1 + + ip -netns "${ns}" rule del pref 0 + ip -netns "${ns}" rule add pref 32765 from all lookup local + ip -netns "${ns}" -6 rule del pref 0 + ip -netns "${ns}" -6 rule add pref 32765 from all lookup local +} + +create_vrf() +{ + local ns=$1 + local vrf=$2 + local table=$3 + + ip -netns "${ns}" link add "${vrf}" type vrf table "${table}" + ip -netns "${ns}" link set "${vrf}" up + ip -netns "${ns}" route add vrf "${vrf}" unreachable default metric 8192 + ip -netns "${ns}" -6 route add vrf "${vrf}" unreachable default metric 8192 + + ip -netns "${ns}" addr add 127.0.0.1/8 dev "${vrf}" + ip -netns "${ns}" -6 addr add ::1 dev "${vrf}" nodad +} + +setup_sym() +{ + local ns + + # make sure we are starting with a clean slate + cleanup + + # + # create nodes as namespaces + # + for ns in h1 h2 r1; do + ip netns add $ns + ip -netns $ns link set lo up + + case "${ns}" in + h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1 + ;; + r1) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 + esac + done + + # + # create interconnects + # + ip -netns h1 link add eth0 type veth peer name r1h1 + ip -netns h1 link set r1h1 netns r1 name eth0 up + + ip -netns h2 link add eth0 type veth peer name r1h2 + ip -netns h2 link set r1h2 netns r1 name eth1 up + + # + # h1 + # + ip -netns h1 addr add dev eth0 ${H1_N1_IP}/24 + ip -netns h1 -6 addr add dev eth0 ${H1_N1_IP6}/64 nodad + ip -netns h1 link set eth0 up + + # h1 to h2 via r1 + ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev eth0 + ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev eth0 + + # + # h2 + # + ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24 + ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad + ip -netns h2 link set eth0 up + + # h2 to h1 via r1 + ip -netns h2 route add default via ${R1_N2_IP} dev eth0 + ip -netns h2 -6 route add default via ${R1_N2_IP6} dev eth0 + + # + # r1 + # + setup_vrf r1 + create_vrf r1 blue 1101 + create_vrf r1 red 1102 + ip -netns r1 link set mtu 1400 dev eth1 + ip -netns r1 link set eth0 vrf blue up + ip -netns r1 link set eth1 vrf red up + ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24 + ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad + ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24 + ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad + + # Route leak from blue to red + ip -netns r1 route add vrf blue ${H2_N2} dev red + ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red + + # Route leak from red to blue + ip -netns r1 route add vrf red ${H1_N1} dev blue + ip -netns r1 -6 route add vrf red ${H1_N1_6} dev blue + + + # Wait for ip config to settle + sleep 2 +} + +setup_asym() +{ + local ns + + # make sure we are starting with a clean slate + cleanup + + # + # create nodes as namespaces + # + for ns in h1 h2 r1 r2; do + ip netns add $ns + ip -netns $ns link set lo up + + case "${ns}" in + h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1 + ;; + r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 + esac + 
done + + # + # create interconnects + # + ip -netns h1 link add eth0 type veth peer name r1h1 + ip -netns h1 link set r1h1 netns r1 name eth0 up + + ip -netns h1 link add eth1 type veth peer name r2h1 + ip -netns h1 link set r2h1 netns r2 name eth0 up + + ip -netns h2 link add eth0 type veth peer name r1h2 + ip -netns h2 link set r1h2 netns r1 name eth1 up + + ip -netns h2 link add eth1 type veth peer name r2h2 + ip -netns h2 link set r2h2 netns r2 name eth1 up + + # + # h1 + # + ip -netns h1 link add br0 type bridge + ip -netns h1 link set br0 up + ip -netns h1 addr add dev br0 ${H1_N1_IP}/24 + ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad + ip -netns h1 link set eth0 master br0 up + ip -netns h1 link set eth1 master br0 up + + # h1 to h2 via r1 + ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev br0 + ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev br0 + + # + # h2 + # + ip -netns h2 link add br0 type bridge + ip -netns h2 link set br0 up + ip -netns h2 addr add dev br0 ${H2_N2_IP}/24 + ip -netns h2 -6 addr add dev br0 ${H2_N2_IP6}/64 nodad + ip -netns h2 link set eth0 master br0 up + ip -netns h2 link set eth1 master br0 up + + # h2 to h1 via r2 + ip -netns h2 route add default via ${R2_N2_IP} dev br0 + ip -netns h2 -6 route add default via ${R2_N2_IP6} dev br0 + + # + # r1 + # + setup_vrf r1 + create_vrf r1 blue 1101 + create_vrf r1 red 1102 + ip -netns r1 link set mtu 1400 dev eth1 + ip -netns r1 link set eth0 vrf blue up + ip -netns r1 link set eth1 vrf red up + ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24 + ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad + ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24 + ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad + + # Route leak from blue to red + ip -netns r1 route add vrf blue ${H2_N2} dev red + ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red + + # No route leak from red to blue + + # + # r2 + # + ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24 + ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad + ip -netns r2 addr add dev eth1 ${R2_N2_IP}/24 + ip -netns r2 -6 addr add dev eth1 ${R2_N2_IP6}/64 nodad + + # Wait for ip config to settle + sleep 2 +} + +check_connectivity() +{ + ip netns exec h1 ping -c1 -w1 ${H2_N2_IP} >/dev/null 2>&1 + log_test $? 0 "Basic IPv4 connectivity" + return $? +} + +check_connectivity6() +{ + ip netns exec h1 "${ping6}" -c1 -w1 ${H2_N2_IP6} >/dev/null 2>&1 + log_test $? 0 "Basic IPv6 connectivity" + return $? +} + +check_traceroute() +{ + if [ ! -x "$(command -v traceroute)" ]; then + echo "SKIP: Could not run IPV4 test without traceroute" + return 1 + fi +} + +check_traceroute6() +{ + if [ ! -x "$(command -v traceroute6)" ]; then + echo "SKIP: Could not run IPV6 test without traceroute6" + return 1 + fi +} + +ipv4_traceroute() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP error route lookup traceroute" + + check_traceroute || return + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "${R1_N1_IP}" ip netns exec h1 traceroute ${H2_N2_IP} + log_test $? 0 "Traceroute reports a hop on r1" +} + +ipv4_traceroute_asym() +{ + ipv4_traceroute asym +} + +ipv6_traceroute() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP error route lookup traceroute" + + check_traceroute6 || return + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "${R1_N1_IP6}" ip netns exec h1 traceroute6 ${H2_N2_IP6} + log_test $? 
0 "Traceroute6 reports a hop on r1" +} + +ipv6_traceroute_asym() +{ + ipv6_traceroute asym +} + +ipv4_ping_ttl() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP ttl error route lookup ping" + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "Time to live exceeded" ip netns exec h1 ping -t1 -c1 -W2 ${H2_N2_IP} + log_test $? 0 "Ping received ICMP ttl exceeded" +} + +ipv4_ping_ttl_asym() +{ + ipv4_ping_ttl asym +} + +ipv4_ping_frag() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP fragmentation error route lookup ping" + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "Frag needed" ip netns exec h1 ping -s 1450 -Mdo -c1 -W2 ${H2_N2_IP} + log_test $? 0 "Ping received ICMP Frag needed" +} + +ipv4_ping_frag_asym() +{ + ipv4_ping_frag asym +} + +ipv6_ping_ttl() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP ttl error route lookup ping" + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "Time exceeded: Hop limit" ip netns exec h1 "${ping6}" -t1 -c1 -W2 ${H2_N2_IP6} + log_test $? 0 "Ping received ICMP Hop limit" +} + +ipv6_ping_ttl_asym() +{ + ipv6_ping_ttl asym +} + +ipv6_ping_frag() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP fragmentation error route lookup ping" + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "Packet too big" ip netns exec h1 "${ping6}" -s 1450 -Mdo -c1 -W2 ${H2_N2_IP6} + log_test $? 0 "Ping received ICMP Packet too big" +} + +ipv6_ping_frag_asym() +{ + ipv6_ping_frag asym +} + +################################################################################ +# usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -4 Run IPv4 tests only + -6 Run IPv6 tests only + -t TEST Run only TEST + -p Pause on fail + -v verbose mode (show commands and output) +EOF +} + +################################################################################ +# main + +# Some systems don't have a ping6 binary anymore +command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping) + +TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_ttl_asym ipv4_traceroute_asym" +TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_frag ipv6_ping_ttl_asym ipv6_traceroute_asym" + +ret=0 +nsuccess=0 +nfail=0 + +while getopts :46t:pvh o +do + case $o in + 4) TESTS=ipv4;; + 6) TESTS=ipv6;; + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=1;; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +# +# show user test config +# +if [ -z "$TESTS" ]; then + TESTS="$TESTS_IPV4 $TESTS_IPV6" +elif [ "$TESTS" = "ipv4" ]; then + TESTS="$TESTS_IPV4" +elif [ "$TESTS" = "ipv6" ]; then + TESTS="$TESTS_IPV6" +fi + +for t in $TESTS +do + case $t in + ipv4_ping_ttl|ping) ipv4_ping_ttl;;& + ipv4_ping_ttl_asym|ping) ipv4_ping_ttl_asym;;& + ipv4_traceroute|traceroute) ipv4_traceroute;;& + ipv4_traceroute_asym|traceroute) ipv4_traceroute_asym;;& + ipv4_ping_frag|ping) ipv4_ping_frag;;& + + ipv6_ping_ttl|ping) ipv6_ping_ttl;;& + ipv6_ping_ttl_asym|ping) ipv6_ping_ttl_asym;;& + ipv6_traceroute|traceroute) ipv6_traceroute;;& + ipv6_traceroute_asym|traceroute) ipv6_traceroute_asym;;& + ipv6_ping_frag|ping) ipv6_ping_frag;;& + + # setup namespaces and config, but do not run any tests + setup_sym|setup) setup_sym; exit 0;; + 
setup_asym) setup_asym; exit 0;; + + help) echo "Test names: $TESTS"; exit 0;; + esac +done + +cleanup + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/netfilter/nf-queue.c index 29c73bce38fa..9e56b9d47037 100644 --- a/tools/testing/selftests/netfilter/nf-queue.c +++ b/tools/testing/selftests/netfilter/nf-queue.c @@ -17,9 +17,12 @@ struct options { bool count_packets; + bool gso_enabled; int verbose; unsigned int queue_num; unsigned int timeout; + uint32_t verdict; + uint32_t delay_ms; }; static unsigned int queue_stats[5]; @@ -27,7 +30,7 @@ static struct options opts; static void help(const char *p) { - printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num]\n", p); + printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); } static int parse_attr_cb(const struct nlattr *attr, void *data) @@ -162,7 +165,7 @@ nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num) } static struct nlmsghdr * -nfq_build_verdict(char *buf, int id, int queue_num, int verd) +nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd) { struct nfqnl_msg_verdict_hdr vh = { .verdict = htonl(verd), @@ -189,9 +192,6 @@ static void print_stats(void) unsigned int last, total; int i; - if (!opts.count_packets) - return; - total = 0; last = queue_stats[0]; @@ -234,7 +234,8 @@ struct mnl_socket *open_queue(void) nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num); - flags = NFQA_CFG_F_GSO | NFQA_CFG_F_UID_GID; + flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0; + flags |= NFQA_CFG_F_UID_GID; mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); @@ -255,6 +256,17 @@ struct mnl_socket *open_queue(void) return nl; } +static void sleep_ms(uint32_t delay) +{ + struct timespec ts = { .tv_sec = delay / 1000 }; + + delay %= 1000; + + ts.tv_nsec = delay * 1000llu * 1000llu; + + nanosleep(&ts, NULL); +} + static int mainloop(void) { unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE; @@ -278,7 +290,7 @@ static int mainloop(void) ret = mnl_socket_recvfrom(nl, buf, buflen); if (ret == -1) { - if (errno == ENOBUFS) + if (errno == ENOBUFS || errno == EINTR) continue; if (errno == EAGAIN) { @@ -298,7 +310,10 @@ static int mainloop(void) } id = ret - MNL_CB_OK; - nlh = nfq_build_verdict(buf, id, opts.queue_num, NF_ACCEPT); + if (opts.delay_ms) + sleep_ms(opts.delay_ms); + + nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict); if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { perror("mnl_socket_sendto"); exit(EXIT_FAILURE); @@ -314,7 +329,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "chvt:q:")) != -1) { + while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) { switch (c) { case 'c': opts.count_packets = true; @@ -328,20 +343,48 @@ static void parse_opts(int argc, char **argv) if (opts.queue_num > 0xffff) opts.queue_num = 0; break; + case 'Q': + opts.verdict = atoi(optarg); + if (opts.verdict > 0xffff) { + fprintf(stderr, "Expected destination queue number\n"); + exit(1); + } + + opts.verdict <<= 16; + opts.verdict |= NF_QUEUE; + break; + case 'd': + opts.delay_ms = atoi(optarg); + if (opts.delay_ms == 0) { + fprintf(stderr, "Expected nonzero delay (in milliseconds)\n"); + exit(1); + } + break; case 't': opts.timeout = atoi(optarg); break; + case 'G': + opts.gso_enabled = false; + break; case 'v': 
opts.verbose++; break; } } + + if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) { + fprintf(stderr, "Cannot use same destination and source queue\n"); + exit(1); + } } int main(int argc, char *argv[]) { int ret; + opts.verdict = NF_ACCEPT; + opts.gso_enabled = true; + parse_opts(argc, argv); ret = mainloop(); diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index d250b84dd5bc..087f0e6e71ce 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -7,8 +7,7 @@ ksft_skip=4 sfx=$(mktemp -u "XXXXXXXX") ns0="ns0-$sfx" -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1; then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi @@ -24,6 +23,8 @@ ip -net "$ns0" addr add 127.0.0.1 dev lo trap cleanup EXIT +currentyear=$(date +%G) +lastyear=$((currentyear-1)) ip netns exec "$ns0" nft -f /dev/stdin <<EOF table inet filter { counter iifcount {} @@ -33,6 +34,9 @@ table inet filter { counter infproto4count {} counter il4protocounter {} counter imarkcounter {} + counter icpu0counter {} + counter ilastyearcounter {} + counter icurrentyearcounter {} counter oifcount {} counter oifnamecount {} @@ -54,6 +58,9 @@ table inet filter { meta nfproto ipv4 counter name "infproto4count" meta l4proto icmp counter name "il4protocounter" meta mark 42 counter name "imarkcounter" + meta cpu 0 counter name "icpu0counter" + meta time "$lastyear-01-01" - "$lastyear-12-31" counter name ilastyearcounter + meta time "$currentyear-01-01" - "$currentyear-12-31" counter name icurrentyearcounter } chain output { @@ -84,11 +91,10 @@ check_one_counter() local want="packets $2" local verbose="$3" - cnt=$(ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want") - if [ $? -ne 0 ];then + if ! 
ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want"; then echo "FAIL: $cname, want \"$want\", got" ret=1 - ip netns exec "$ns0" nft list counter inet filter $counter + ip netns exec "$ns0" nft list counter inet filter $cname fi } @@ -100,8 +106,7 @@ check_lo_counters() for counter in iifcount iifnamecount iifgroupcount iiftypecount infproto4count \ oifcount oifnamecount oifgroupcount oiftypecount onfproto4count \ - il4protocounter \ - ol4protocounter \ + il4protocounter icurrentyearcounter ol4protocounter \ ; do check_one_counter "$counter" "$want" "$verbose" done @@ -116,9 +121,22 @@ check_one_counter oskuidcounter "1" true check_one_counter oskgidcounter "1" true check_one_counter imarkcounter "1" true check_one_counter omarkcounter "1" true +check_one_counter ilastyearcounter "0" true if [ $ret -eq 0 ];then echo "OK: nftables meta iif/oif counters at expected values" +else + exit $ret +fi + +#First CPU execution and counter +taskset -p 01 $$ > /dev/null +ip netns exec "$ns0" nft reset counters > /dev/null +ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null +check_one_counter icpu0counter "2" true + +if [ $ret -eq 0 ];then + echo "OK: nftables meta cpu counter at expected values" fi exit $ret diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh index 6898448b4266..3d202b90b33d 100755 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ b/tools/testing/selftests/netfilter/nft_queue.sh @@ -12,6 +12,7 @@ sfx=$(mktemp -u "XXXXXXXX") ns1="ns1-$sfx" ns2="ns2-$sfx" nsrouter="nsrouter-$sfx" +timeout=4 cleanup() { @@ -20,6 +21,7 @@ cleanup() ip netns del ${nsrouter} rm -f "$TMPFILE0" rm -f "$TMPFILE1" + rm -f "$TMPFILE2" "$TMPFILE3" } nft --version > /dev/null 2>&1 @@ -42,6 +44,8 @@ fi TMPFILE0=$(mktemp) TMPFILE1=$(mktemp) +TMPFILE2=$(mktemp) +TMPFILE3=$(mktemp) trap cleanup EXIT ip netns add ${ns1} @@ -83,7 +87,7 @@ load_ruleset() { local name=$1 local prio=$2 -ip netns exec ${nsrouter} nft -f - <<EOF +ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF table inet $name { chain nfq { ip protocol icmp queue bypass @@ -118,7 +122,7 @@ EOF load_counter_ruleset() { local prio=$1 -ip netns exec ${nsrouter} nft -f - <<EOF +ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF table inet countrules { chain pre { type filter hook prerouting priority $prio; policy accept; @@ -175,7 +179,7 @@ test_ping_router() { test_queue_blackhole() { local proto=$1 -ip netns exec ${nsrouter} nft -f - <<EOF +ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF table $proto blackh { chain forward { type filter hook forward priority 0; policy accept; @@ -184,10 +188,10 @@ table $proto blackh { } EOF if [ $proto = "ip" ] ;then - ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null + ip netns exec ${ns1} ping -W 2 -c 1 -q 10.0.2.99 > /dev/null lret=$? elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null + ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null lret=$? else lret=111 @@ -214,8 +218,8 @@ test_queue() local last="" # spawn nf-queue listeners - ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t 3 > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t 3 > "$TMPFILE1" & + ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" & sleep 1 test_ping ret=$? 
@@ -250,11 +254,11 @@ test_queue() test_tcp_forward() { - ip netns exec ${nsrouter} ./nf-queue -q 2 -t 10 & + ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout & local nfqpid=$! tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=100 of=$tmpfile + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & local rpid=$! @@ -270,15 +274,13 @@ test_tcp_forward() test_tcp_localhost() { - tc -net "${nsrouter}" qdisc add dev lo root netem loss random 1% - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=900 of=$tmpfile + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & local rpid=$! - ip netns exec ${nsrouter} ./nf-queue -q 3 -t 30 & + ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & local nfqpid=$! sleep 1 @@ -287,6 +289,47 @@ test_tcp_localhost() wait $rpid [ $? -eq 0 ] && echo "PASS: tcp via loopback" + wait 2>/dev/null +} + +test_tcp_localhost_requeue() +{ +ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF +flush ruleset +table inet filter { + chain output { + type filter hook output priority 0; policy accept; + tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 + } + chain post { + type filter hook postrouting priority 0; policy accept; + tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0 + } +} +EOF + tmpfile=$(mktemp) || exit 1 + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile + ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & + local rpid=$! + + ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" & + + # The nf-queue instance on queue 0 is fed by the output and postrouting + # hooks above. But this time, it re-queues each packet to the nf-queue + # instance on queue 1. + ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & + + sleep 1 + ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null + rm -f "$tmpfile" + + wait + + if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then + echo "FAIL: lost packets during requeue?!" 1>&2 + return + fi + + echo "PASS: tcp via loopback and re-queueing" } ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null @@ -328,5 +371,6 @@ test_queue 20 test_tcp_forward test_tcp_localhost +test_tcp_localhost_requeue exit $ret
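For reference, the new nf-queue flags that test_tcp_localhost_requeue relies on combine as in this hand-run sketch (queue numbers and timeout as in nft_queue.sh above; run inside the router namespace):

  # Count verdicts on queue 1, the requeue target.
  ip netns exec "$nsrouter" ./nf-queue -c -q 1 -t "$timeout" > "$TMPFILE2" &
  # Queue 0 receives packets from the nft output/postrouting rules.
  # -G leaves NFQA_CFG_F_GSO unset so the kernel segments GSO packets,
  # -d 150 sleeps 150 ms before each verdict, and -Q 1 issues an
  # NF_QUEUE verdict with the target queue in the upper 16 bits
  # (see parse_opts() in nf-queue.c).
  ip netns exec "$nsrouter" ./nf-queue -G -d 150 -c -q 0 -Q 1 -t "$timeout" > "$TMPFILE3" &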