diff options
141 files changed, 2139 insertions, 1189 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index beb8ec4dabbc..5ecfd72ba684 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -188,6 +188,14 @@ Description: Indicates the interface unique physical port identifier within the NIC, as a string. +What: /sys/class/net/<iface>/phys_port_name +Date: March 2015 +KernelVersion: 4.0 +Contact: netdev@vger.kernel.org +Description: + Indicates the interface physical port name within the NIC, + as a string. + What: /sys/class/net/<iface>/speed Date: October 2009 KernelVersion: 2.6.33 diff --git a/Documentation/ABI/testing/sysfs-class-net-queues b/Documentation/ABI/testing/sysfs-class-net-queues index 5e9aeb91d355..0c0df91b1516 100644 --- a/Documentation/ABI/testing/sysfs-class-net-queues +++ b/Documentation/ABI/testing/sysfs-class-net-queues @@ -24,6 +24,14 @@ Description: Indicates the number of transmit timeout events seen by this network interface transmit queue. +What: /sys/class/<iface>/queues/tx-<queue>/tx_maxrate +Date: March 2015 +KernelVersion: 4.1 +Contact: netdev@vger.kernel.org +Description: + A Mbps max-rate set for the queue, a value of zero means disabled, + default is disabled. + What: /sys/class/<iface>/queues/tx-<queue>/xps_cpus Date: November 2010 KernelVersion: 2.6.38 diff --git a/Documentation/devicetree/bindings/net/apm-xgene-enet.txt b/Documentation/devicetree/bindings/net/apm-xgene-enet.txt index 6151999c5dca..dc7961b33076 100644 --- a/Documentation/devicetree/bindings/net/apm-xgene-enet.txt +++ b/Documentation/devicetree/bindings/net/apm-xgene-enet.txt @@ -15,6 +15,7 @@ Required properties for all the ethernet interfaces: - "ring_csr": Descriptor ring control and status register address space - "ring_cmd": Descriptor ring command register address space - interrupts: Ethernet main interrupt +- port-id: Port number (0 or 1) - clocks: Reference to the clock entry. - local-mac-address: MAC address assigned to this device - phy-connection-type: Interface type between ethernet device and PHY device @@ -49,6 +50,7 @@ Example: <0x0 0X10000000 0x0 0X200>; reg-names = "enet_csr", "ring_csr", "ring_cmd"; interrupts = <0x0 0x3c 0x4>; + port-id = <0>; clocks = <&menetclk 0>; local-mac-address = [00 01 73 00 00 01]; phy-connection-type = "rgmii"; diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt index 99ca40e8e810..cbfac0949635 100644 --- a/Documentation/networking/scaling.txt +++ b/Documentation/networking/scaling.txt @@ -421,6 +421,15 @@ best CPUs to share a given queue are probably those that share the cache with the CPU that processes transmit completions for that queue (transmit interrupts). +Per TX Queue rate limitation: +============================= + +These are rate-limitation mechanisms implemented by HW, where currently +a max-rate attribute is supported, by setting a Mbps value to + +/sys/class/net/<dev>/queues/tx-<n>/tx_maxrate + +A value of zero means disabled, and this is the default. Further Information =================== diff --git a/arch/arm64/boot/dts/apm/apm-mustang.dts b/arch/arm64/boot/dts/apm/apm-mustang.dts index 2e25de0800b9..83578e766b94 100644 --- a/arch/arm64/boot/dts/apm/apm-mustang.dts +++ b/arch/arm64/boot/dts/apm/apm-mustang.dts @@ -45,6 +45,10 @@ status = "ok"; }; +&sgenet1 { + status = "ok"; +}; + &xgenet { status = "ok"; }; diff --git a/arch/arm64/boot/dts/apm/apm-storm.dtsi b/arch/arm64/boot/dts/apm/apm-storm.dtsi index a857794432d6..c1eb6911e539 100644 --- a/arch/arm64/boot/dts/apm/apm-storm.dtsi +++ b/arch/arm64/boot/dts/apm/apm-storm.dtsi @@ -186,6 +186,16 @@ clock-output-names = "sge0clk"; }; + sge1clk: sge1clk@1f21c000 { + compatible = "apm,xgene-device-clock"; + #clock-cells = <1>; + clocks = <&socplldiv2 0>; + reg = <0x0 0x1f21c000 0x0 0x1000>; + reg-names = "csr-reg"; + csr-mask = <0xc>; + clock-output-names = "sge1clk"; + }; + xge0clk: xge0clk@1f61c000 { compatible = "apm,xgene-device-clock"; #clock-cells = <1>; @@ -635,6 +645,21 @@ phy-connection-type = "sgmii"; }; + sgenet1: ethernet@1f210030 { + compatible = "apm,xgene1-sgenet"; + status = "disabled"; + reg = <0x0 0x1f210030 0x0 0xd100>, + <0x0 0x1f200000 0x0 0Xc300>, + <0x0 0x1B000000 0x0 0X8000>; + reg-names = "enet_csr", "ring_csr", "ring_cmd"; + interrupts = <0x0 0xAC 0x4>; + port-id = <1>; + dma-coherent; + clocks = <&sge1clk 0>; + local-mac-address = [00 00 00 00 00 00]; + phy-connection-type = "sgmii"; + }; + xgenet: ethernet@1f610000 { compatible = "apm,xgene1-xgenet"; status = "disabled"; diff --git a/drivers/net/can/cc770/cc770_platform.c b/drivers/net/can/cc770/cc770_platform.c index b1e8851d3cc4..866e5e12fdd2 100644 --- a/drivers/net/can/cc770/cc770_platform.c +++ b/drivers/net/can/cc770/cc770_platform.c @@ -254,7 +254,7 @@ static int cc770_platform_remove(struct platform_device *pdev) return 0; } -static struct of_device_id cc770_platform_table[] = { +static const struct of_device_id cc770_platform_table[] = { {.compatible = "bosch,cc770"}, /* CC770 from Bosch */ {.compatible = "intc,82527"}, /* AN82527 from Intel CP */ {}, diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index fed1bbd0b0d2..e3d7e22a4fa0 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -1725,7 +1725,7 @@ static int grcan_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id grcan_match[] = { +static const struct of_device_id grcan_match[] = { {.name = "GAISLER_GRCAN"}, {.name = "01_03d"}, {.name = "GAISLER_GRHCAN"}, diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c index ad024e60ba8c..c7427bdd3a4b 100644 --- a/drivers/net/can/mscan/mpc5xxx_can.c +++ b/drivers/net/can/mscan/mpc5xxx_can.c @@ -43,7 +43,7 @@ struct mpc5xxx_can_data { }; #ifdef CONFIG_PPC_MPC52xx -static struct of_device_id mpc52xx_cdm_ids[] = { +static const struct of_device_id mpc52xx_cdm_ids[] = { { .compatible = "fsl,mpc5200-cdm", }, {} }; diff --git a/drivers/net/can/sja1000/sja1000_platform.c b/drivers/net/can/sja1000/sja1000_platform.c index 93115250eaf5..0552ed46a206 100644 --- a/drivers/net/can/sja1000/sja1000_platform.c +++ b/drivers/net/can/sja1000/sja1000_platform.c @@ -242,7 +242,7 @@ static int sp_remove(struct platform_device *pdev) return 0; } -static struct of_device_id sp_of_table[] = { +static const struct of_device_id sp_of_table[] = { {.compatible = "nxp,sja1000"}, {}, }; diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 6c6764312285..6bddfe062b51 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -1185,7 +1185,7 @@ static int xcan_remove(struct platform_device *pdev) } /* Match table for OF platform binding */ -static struct of_device_id xcan_of_match[] = { +static const struct of_device_id xcan_of_match[] = { { .compatible = "xlnx,zynq-can-1.0", }, { .compatible = "xlnx,axi-can-1.00.a", }, { /* end of list */ }, diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index 48e62a34f7f2..18550c7ebe6f 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -7,7 +7,7 @@ config NET_DSA_MV88E6XXX config NET_DSA_MV88E6060 tristate "Marvell 88E6060 ethernet switch chip support" - select NET_DSA + depends on NET_DSA select NET_DSA_TAG_TRAILER ---help--- This enables support for the Marvell 88E6060 ethernet switch @@ -19,7 +19,7 @@ config NET_DSA_MV88E6XXX_NEED_PPU config NET_DSA_MV88E6131 tristate "Marvell 88E6085/6095/6095F/6131 ethernet switch chip support" - select NET_DSA + depends on NET_DSA select NET_DSA_MV88E6XXX select NET_DSA_MV88E6XXX_NEED_PPU select NET_DSA_TAG_DSA @@ -29,7 +29,7 @@ config NET_DSA_MV88E6131 config NET_DSA_MV88E6123_61_65 tristate "Marvell 88E6123/6161/6165 ethernet switch chip support" - select NET_DSA + depends on NET_DSA select NET_DSA_MV88E6XXX select NET_DSA_TAG_EDSA ---help--- @@ -38,7 +38,7 @@ config NET_DSA_MV88E6123_61_65 config NET_DSA_MV88E6171 tristate "Marvell 88E6171/6172 ethernet switch chip support" - select NET_DSA + depends on NET_DSA select NET_DSA_MV88E6XXX select NET_DSA_TAG_EDSA ---help--- @@ -47,7 +47,7 @@ config NET_DSA_MV88E6171 config NET_DSA_MV88E6352 tristate "Marvell 88E6176/88E6352 ethernet switch chip support" - select NET_DSA + depends on NET_DSA select NET_DSA_MV88E6XXX select NET_DSA_TAG_EDSA ---help--- @@ -56,8 +56,7 @@ config NET_DSA_MV88E6352 config NET_DSA_BCM_SF2 tristate "Broadcom Starfighter 2 Ethernet switch support" - depends on HAS_IOMEM - select NET_DSA + depends on HAS_IOMEM && NET_DSA select NET_DSA_TAG_BRCM select FIXED_PHY select BCM7XXX_PHY diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c index 2b8bfeeee9cf..ae89de7deb13 100644 --- a/drivers/net/ethernet/aeroflex/greth.c +++ b/drivers/net/ethernet/aeroflex/greth.c @@ -1588,7 +1588,7 @@ static int greth_of_remove(struct platform_device *of_dev) return 0; } -static struct of_device_id greth_of_match[] = { +static const struct of_device_id greth_of_match[] = { { .name = "GAISLER_ETHMAC", }, diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index fd9296a5014d..79ea35869e1e 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -89,7 +89,7 @@ MODULE_PARM_DESC(dma_tx_num, "Number of descriptors in the TX list"); #define TXQUEUESTOP_THRESHHOLD 2 -static struct of_device_id altera_tse_ids[]; +static const struct of_device_id altera_tse_ids[]; static inline u32 tse_tx_avail(struct altera_tse_private *priv) { @@ -1576,7 +1576,7 @@ static const struct altera_dmaops altera_dtype_msgdma = { .start_rxdma = msgdma_start_rxdma, }; -static struct of_device_id altera_tse_ids[] = { +static const struct of_device_id altera_tse_ids[] = { { .compatible = "altr,tse-msgdma-1.0", .data = &altera_dtype_msgdma, }, { .compatible = "altr,tse-1.0", .data = &altera_dtype_sgdma, }, { .compatible = "ALTR,tse-1.0", .data = &altera_dtype_sgdma, }, diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h index ec45f3256f0e..d9bc89d69266 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h @@ -97,6 +97,8 @@ enum xgene_enet_rm { #define QCOHERENT BIT(4) #define RECOMBBUF BIT(27) +#define MAC_OFFSET 0x30 + #define BLOCK_ETH_CSR_OFFSET 0x2000 #define BLOCK_ETH_RING_IF_OFFSET 0x9000 #define BLOCK_ETH_DIAG_CSR_OFFSET 0xD000 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 635a83be7e5e..6146a993a136 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -645,9 +645,11 @@ static int xgene_enet_create_desc_rings(struct net_device *ndev) struct device *dev = ndev_to_dev(ndev); struct xgene_enet_desc_ring *rx_ring, *tx_ring, *cp_ring; struct xgene_enet_desc_ring *buf_pool = NULL; - u8 cpu_bufnum = 0, eth_bufnum = START_ETH_BUFNUM; - u8 bp_bufnum = START_BP_BUFNUM; - u16 ring_id, ring_num = START_RING_NUM; + u8 cpu_bufnum = pdata->cpu_bufnum; + u8 eth_bufnum = pdata->eth_bufnum; + u8 bp_bufnum = pdata->bp_bufnum; + u16 ring_num = pdata->ring_num; + u16 ring_id; int ret; /* allocate rx descriptor ring */ @@ -752,6 +754,22 @@ static const struct net_device_ops xgene_ndev_ops = { .ndo_set_mac_address = xgene_enet_set_mac_address, }; +static int xgene_get_port_id(struct device *dev, struct xgene_enet_pdata *pdata) +{ + u32 id = 0; + int ret; + + ret = device_property_read_u32(dev, "port-id", &id); + if (!ret && id > 1) { + dev_err(dev, "Incorrect port-id specified\n"); + return -ENODEV; + } + + pdata->port_id = id; + + return 0; +} + static int xgene_get_mac_address(struct device *dev, unsigned char *addr) { @@ -843,6 +861,10 @@ static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata) } pdata->rx_irq = ret; + ret = xgene_get_port_id(dev, pdata); + if (ret) + return ret; + if (xgene_get_mac_address(dev, ndev->dev_addr) != ETH_ALEN) eth_hw_addr_random(ndev); @@ -866,13 +888,13 @@ static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata) pdata->clk = NULL; } - base_addr = pdata->base_addr; + base_addr = pdata->base_addr - (pdata->port_id * MAC_OFFSET); pdata->eth_csr_addr = base_addr + BLOCK_ETH_CSR_OFFSET; pdata->eth_ring_if_addr = base_addr + BLOCK_ETH_RING_IF_OFFSET; pdata->eth_diag_csr_addr = base_addr + BLOCK_ETH_DIAG_CSR_OFFSET; if (pdata->phy_mode == PHY_INTERFACE_MODE_RGMII || pdata->phy_mode == PHY_INTERFACE_MODE_SGMII) { - pdata->mcx_mac_addr = base_addr + BLOCK_ETH_MAC_OFFSET; + pdata->mcx_mac_addr = pdata->base_addr + BLOCK_ETH_MAC_OFFSET; pdata->mcx_mac_csr_addr = base_addr + BLOCK_ETH_MAC_CSR_OFFSET; } else { pdata->mcx_mac_addr = base_addr + BLOCK_AXG_MAC_OFFSET; @@ -935,6 +957,24 @@ static void xgene_enet_setup_ops(struct xgene_enet_pdata *pdata) pdata->rm = RM0; break; } + + switch (pdata->port_id) { + case 0: + pdata->cpu_bufnum = START_CPU_BUFNUM_0; + pdata->eth_bufnum = START_ETH_BUFNUM_0; + pdata->bp_bufnum = START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + break; + case 1: + pdata->cpu_bufnum = START_CPU_BUFNUM_1; + pdata->eth_bufnum = START_ETH_BUFNUM_1; + pdata->bp_bufnum = START_BP_BUFNUM_1; + pdata->ring_num = START_RING_NUM_1; + break; + default: + break; + } + } static int xgene_enet_probe(struct platform_device *pdev) @@ -1033,7 +1073,7 @@ MODULE_DEVICE_TABLE(acpi, xgene_enet_acpi_match); #endif #ifdef CONFIG_OF -static struct of_device_id xgene_enet_of_match[] = { +static const struct of_device_id xgene_enet_of_match[] = { {.compatible = "apm,xgene-enet",}, {.compatible = "apm,xgene1-sgenet",}, {.compatible = "apm,xgene1-xgenet",}, diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h index c2d465c3db66..b93ed21a157f 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h @@ -41,9 +41,15 @@ #define SKB_BUFFER_SIZE (XGENE_ENET_MAX_MTU - NET_IP_ALIGN) #define NUM_PKT_BUF 64 #define NUM_BUFPOOL 32 -#define START_ETH_BUFNUM 2 -#define START_BP_BUFNUM 0x22 -#define START_RING_NUM 8 + +#define START_CPU_BUFNUM_0 0 +#define START_ETH_BUFNUM_0 2 +#define START_BP_BUFNUM_0 0x22 +#define START_RING_NUM_0 8 +#define START_CPU_BUFNUM_1 12 +#define START_ETH_BUFNUM_1 10 +#define START_BP_BUFNUM_1 0x2A +#define START_RING_NUM_1 264 #define PHY_POLL_LINK_ON (10 * HZ) #define PHY_POLL_LINK_OFF (PHY_POLL_LINK_ON / 5) @@ -125,6 +131,11 @@ struct xgene_enet_pdata { struct xgene_mac_ops *mac_ops; struct xgene_port_ops *port_ops; struct delayed_work link_work; + u32 port_id; + u8 cpu_bufnum; + u8 eth_bufnum; + u8 bp_bufnum; + u16 ring_num; }; struct xgene_indirect_ctl { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c index f5d4f68c288c..f27fb6f2a93b 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c @@ -226,6 +226,7 @@ static u32 xgene_enet_link_status(struct xgene_enet_pdata *p) static void xgene_sgmac_init(struct xgene_enet_pdata *p) { u32 data, loop = 10; + u32 offset = p->port_id * 4; xgene_sgmac_reset(p); @@ -272,9 +273,9 @@ static void xgene_sgmac_init(struct xgene_enet_pdata *p) xgene_enet_wr_csr(p, RSIF_RAM_DBG_REG0_ADDR, 0); /* Bypass traffic gating */ - xgene_enet_wr_csr(p, CFG_LINK_AGGR_RESUME_0_ADDR, TX_PORT0); + xgene_enet_wr_csr(p, CFG_LINK_AGGR_RESUME_0_ADDR + offset, TX_PORT0); xgene_enet_wr_csr(p, CFG_BYPASS_ADDR, RESUME_TX); - xgene_enet_wr_csr(p, SG_RX_DV_GATE_REG_0_ADDR, RESUME_RX0); + xgene_enet_wr_csr(p, SG_RX_DV_GATE_REG_0_ADDR + offset, RESUME_RX0); } static void xgene_sgmac_rxtx(struct xgene_enet_pdata *p, u32 bits, bool set) @@ -330,13 +331,14 @@ static void xgene_enet_cle_bypass(struct xgene_enet_pdata *p, u32 dst_ring_num, u16 bufpool_id) { u32 data, fpsel; + u32 offset = p->port_id * MAC_OFFSET; data = CFG_CLE_BYPASS_EN0; - xgene_enet_wr_csr(p, CLE_BYPASS_REG0_0_ADDR, data); + xgene_enet_wr_csr(p, CLE_BYPASS_REG0_0_ADDR + offset, data); fpsel = xgene_enet_ring_bufnum(bufpool_id) - 0x20; data = CFG_CLE_DSTQID0(dst_ring_num) | CFG_CLE_FPSEL0(fpsel); - xgene_enet_wr_csr(p, CLE_BYPASS_REG1_0_ADDR, data); + xgene_enet_wr_csr(p, CLE_BYPASS_REG1_0_ADDR + offset, data); } static void xgene_enet_shutdown(struct xgene_enet_pdata *p) diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c index daae0e016253..2f98846e2d89 100644 --- a/drivers/net/ethernet/apple/bmac.c +++ b/drivers/net/ethernet/apple/bmac.c @@ -1621,7 +1621,7 @@ static int bmac_remove(struct macio_dev *mdev) return 0; } -static struct of_device_id bmac_match[] = +static const struct of_device_id bmac_match[] = { { .name = "bmac", diff --git a/drivers/net/ethernet/apple/mace.c b/drivers/net/ethernet/apple/mace.c index 7fcaf0da42a8..a18948286682 100644 --- a/drivers/net/ethernet/apple/mace.c +++ b/drivers/net/ethernet/apple/mace.c @@ -984,7 +984,7 @@ static irqreturn_t mace_rxdma_intr(int irq, void *dev_id) return IRQ_HANDLED; } -static struct of_device_id mace_match[] = +static const struct of_device_id mace_match[] = { { .name = "mace", diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index e74ae628bbb9..12956b143b11 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -197,6 +197,14 @@ enum dma_reg { DMA_PRIORITY_0, DMA_PRIORITY_1, DMA_PRIORITY_2, + DMA_INDEX2RING_0, + DMA_INDEX2RING_1, + DMA_INDEX2RING_2, + DMA_INDEX2RING_3, + DMA_INDEX2RING_4, + DMA_INDEX2RING_5, + DMA_INDEX2RING_6, + DMA_INDEX2RING_7, }; static const u8 bcmgenet_dma_regs_v3plus[] = { @@ -208,6 +216,14 @@ static const u8 bcmgenet_dma_regs_v3plus[] = { [DMA_PRIORITY_0] = 0x30, [DMA_PRIORITY_1] = 0x34, [DMA_PRIORITY_2] = 0x38, + [DMA_INDEX2RING_0] = 0x70, + [DMA_INDEX2RING_1] = 0x74, + [DMA_INDEX2RING_2] = 0x78, + [DMA_INDEX2RING_3] = 0x7C, + [DMA_INDEX2RING_4] = 0x80, + [DMA_INDEX2RING_5] = 0x84, + [DMA_INDEX2RING_6] = 0x88, + [DMA_INDEX2RING_7] = 0x8C, }; static const u8 bcmgenet_dma_regs_v2[] = { @@ -2283,6 +2299,160 @@ static void bcmgenet_enable_dma(struct bcmgenet_priv *priv, u32 dma_ctrl) bcmgenet_tdma_writel(priv, reg, DMA_CTRL); } +static bool bcmgenet_hfb_is_filter_enabled(struct bcmgenet_priv *priv, + u32 f_index) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + return !!(reg & (1 << (f_index % 32))); +} + +static void bcmgenet_hfb_enable_filter(struct bcmgenet_priv *priv, u32 f_index) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_ENABLE_V3PLUS + (f_index < 32) * sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + reg |= (1 << (f_index % 32)); + bcmgenet_hfb_reg_writel(priv, reg, offset); +} + +static void bcmgenet_hfb_set_filter_rx_queue_mapping(struct bcmgenet_priv *priv, + u32 f_index, u32 rx_queue) +{ + u32 offset; + u32 reg; + + offset = f_index / 8; + reg = bcmgenet_rdma_readl(priv, DMA_INDEX2RING_0 + offset); + reg &= ~(0xF << (4 * (f_index % 8))); + reg |= ((rx_queue & 0xF) << (4 * (f_index % 8))); + bcmgenet_rdma_writel(priv, reg, DMA_INDEX2RING_0 + offset); +} + +static void bcmgenet_hfb_set_filter_length(struct bcmgenet_priv *priv, + u32 f_index, u32 f_length) +{ + u32 offset; + u32 reg; + + offset = HFB_FLT_LEN_V3PLUS + + ((priv->hw_params->hfb_filter_cnt - 1 - f_index) / 4) * + sizeof(u32); + reg = bcmgenet_hfb_reg_readl(priv, offset); + reg &= ~(0xFF << (8 * (f_index % 4))); + reg |= ((f_length & 0xFF) << (8 * (f_index % 4))); + bcmgenet_hfb_reg_writel(priv, reg, offset); +} + +static int bcmgenet_hfb_find_unused_filter(struct bcmgenet_priv *priv) +{ + u32 f_index; + + for (f_index = 0; f_index < priv->hw_params->hfb_filter_cnt; f_index++) + if (!bcmgenet_hfb_is_filter_enabled(priv, f_index)) + return f_index; + + return -ENOMEM; +} + +/* bcmgenet_hfb_add_filter + * + * Add new filter to Hardware Filter Block to match and direct Rx traffic to + * desired Rx queue. + * + * f_data is an array of unsigned 32-bit integers where each 32-bit integer + * provides filter data for 2 bytes (4 nibbles) of Rx frame: + * + * bits 31:20 - unused + * bit 19 - nibble 0 match enable + * bit 18 - nibble 1 match enable + * bit 17 - nibble 2 match enable + * bit 16 - nibble 3 match enable + * bits 15:12 - nibble 0 data + * bits 11:8 - nibble 1 data + * bits 7:4 - nibble 2 data + * bits 3:0 - nibble 3 data + * + * Example: + * In order to match: + * - Ethernet frame type = 0x0800 (IP) + * - IP version field = 4 + * - IP protocol field = 0x11 (UDP) + * + * The following filter is needed: + * u32 hfb_filter_ipv4_udp[] = { + * Rx frame offset 0x00: 0x00000000, 0x00000000, 0x00000000, 0x00000000, + * Rx frame offset 0x08: 0x00000000, 0x00000000, 0x000F0800, 0x00084000, + * Rx frame offset 0x10: 0x00000000, 0x00000000, 0x00000000, 0x00030011, + * }; + * + * To add the filter to HFB and direct the traffic to Rx queue 0, call: + * bcmgenet_hfb_add_filter(priv, hfb_filter_ipv4_udp, + * ARRAY_SIZE(hfb_filter_ipv4_udp), 0); + */ +int bcmgenet_hfb_add_filter(struct bcmgenet_priv *priv, u32 *f_data, + u32 f_length, u32 rx_queue) +{ + int f_index; + u32 i; + + f_index = bcmgenet_hfb_find_unused_filter(priv); + if (f_index < 0) + return -ENOMEM; + + if (f_length > priv->hw_params->hfb_filter_size) + return -EINVAL; + + for (i = 0; i < f_length; i++) + bcmgenet_hfb_writel(priv, f_data[i], + (f_index * priv->hw_params->hfb_filter_size + i) * + sizeof(u32)); + + bcmgenet_hfb_set_filter_length(priv, f_index, 2 * f_length); + bcmgenet_hfb_set_filter_rx_queue_mapping(priv, f_index, rx_queue); + bcmgenet_hfb_enable_filter(priv, f_index); + bcmgenet_hfb_reg_writel(priv, 0x1, HFB_CTRL); + + return 0; +} + +/* bcmgenet_hfb_clear + * + * Clear Hardware Filter Block and disable all filtering. + */ +static void bcmgenet_hfb_clear(struct bcmgenet_priv *priv) +{ + u32 i; + + bcmgenet_hfb_reg_writel(priv, 0x0, HFB_CTRL); + bcmgenet_hfb_reg_writel(priv, 0x0, HFB_FLT_ENABLE_V3PLUS); + bcmgenet_hfb_reg_writel(priv, 0x0, HFB_FLT_ENABLE_V3PLUS + 4); + + for (i = DMA_INDEX2RING_0; i <= DMA_INDEX2RING_7; i++) + bcmgenet_rdma_writel(priv, 0x0, i); + + for (i = 0; i < (priv->hw_params->hfb_filter_cnt / 4); i++) + bcmgenet_hfb_reg_writel(priv, 0x0, + HFB_FLT_LEN_V3PLUS + i * sizeof(u32)); + + for (i = 0; i < priv->hw_params->hfb_filter_cnt * + priv->hw_params->hfb_filter_size; i++) + bcmgenet_hfb_writel(priv, 0x0, i * sizeof(u32)); +} + +static void bcmgenet_hfb_init(struct bcmgenet_priv *priv) +{ + if (GENET_IS_V1(priv) || GENET_IS_V2(priv)) + return; + + bcmgenet_hfb_clear(priv); +} + static void bcmgenet_netif_start(struct net_device *dev) { struct bcmgenet_priv *priv = netdev_priv(dev); @@ -2348,6 +2518,9 @@ static int bcmgenet_open(struct net_device *dev) /* Always enable ring 16 - descriptor ring */ bcmgenet_enable_dma(priv, dma_ctrl); + /* HFB init */ + bcmgenet_hfb_init(priv); + ret = request_irq(priv->irq0, bcmgenet_isr0, IRQF_SHARED, dev->name, priv); if (ret < 0) { @@ -2592,6 +2765,7 @@ static struct bcmgenet_hw_params bcmgenet_hw_params[] = { .bp_in_en_shift = 17, .bp_in_mask = 0x1ffff, .hfb_filter_cnt = 48, + .hfb_filter_size = 128, .qtag_mask = 0x3F, .tbuf_offset = 0x0600, .hfb_offset = 0x8000, @@ -2609,6 +2783,7 @@ static struct bcmgenet_hw_params bcmgenet_hw_params[] = { .bp_in_en_shift = 17, .bp_in_mask = 0x1ffff, .hfb_filter_cnt = 48, + .hfb_filter_size = 128, .qtag_mask = 0x3F, .tbuf_offset = 0x0600, .hfb_offset = 0x8000, diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index 2a8113898aed..1ea838946318 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -509,6 +509,7 @@ struct bcmgenet_hw_params { u8 bp_in_en_shift; u32 bp_in_mask; u8 hfb_filter_cnt; + u8 hfb_filter_size; u8 qtag_mask; u16 tbuf_offset; u32 hfb_offset; diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index f88cfaa359e7..442410cd2ca4 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -1299,7 +1299,7 @@ static int ethoc_resume(struct platform_device *pdev) # define ethoc_resume NULL #endif -static struct of_device_id ethoc_match[] = { +static const struct of_device_id ethoc_match[] = { { .compatible = "opencores,ethoc", }, {}, }; diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index f495796248db..afe7f39cdd7c 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -1057,7 +1057,7 @@ static int mpc52xx_fec_of_resume(struct platform_device *op) } #endif -static struct of_device_id mpc52xx_fec_match[] = { +static const struct of_device_id mpc52xx_fec_match[] = { { .compatible = "fsl,mpc5200b-fec", }, { .compatible = "fsl,mpc5200-fec", }, { .compatible = "mpc5200-fec", }, diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c index e0528900db02..1e647beaf989 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c @@ -134,7 +134,7 @@ static int mpc52xx_fec_mdio_remove(struct platform_device *of) return 0; } -static struct of_device_id mpc52xx_fec_mdio_match[] = { +static const struct of_device_id mpc52xx_fec_mdio_match[] = { { .compatible = "fsl,mpc5200b-mdio", }, { .compatible = "fsl,mpc5200-mdio", }, { .compatible = "mpc5200b-fec-phy", }, diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index a17628769a1f..9b3639eae676 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -916,7 +916,7 @@ static const struct net_device_ops fs_enet_netdev_ops = { #endif }; -static struct of_device_id fs_enet_match[]; +static const struct of_device_id fs_enet_match[]; static int fs_enet_probe(struct platform_device *ofdev) { const struct of_device_id *match; @@ -1082,7 +1082,7 @@ static int fs_enet_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id fs_enet_match[] = { +static const struct of_device_id fs_enet_match[] = { #ifdef CONFIG_FS_ENET_HAS_SCC { .compatible = "fsl,cpm1-scc-enet", diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c index 1d5617d2d8bd..68a428de0bc0 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c +++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c @@ -213,7 +213,7 @@ static int fs_enet_mdio_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id fs_enet_mdio_bb_match[] = { +static const struct of_device_id fs_enet_mdio_bb_match[] = { { .compatible = "fsl,cpm2-mdio-bitbang", }, diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c index 1648e3582500..2be383e6d258 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c @@ -95,7 +95,7 @@ static int fs_enet_fec_mii_write(struct mii_bus *bus, int phy_id, int location, } -static struct of_device_id fs_enet_mdio_fec_match[]; +static const struct of_device_id fs_enet_mdio_fec_match[]; static int fs_enet_mdio_probe(struct platform_device *ofdev) { const struct of_device_id *match; @@ -208,7 +208,7 @@ static int fs_enet_mdio_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id fs_enet_mdio_fec_match[] = { +static const struct of_device_id fs_enet_mdio_fec_match[] = { { .compatible = "fsl,pq1-fec-mdio", }, diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c index d1a91e344e6b..3c40f6b99224 100644 --- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c +++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c @@ -294,7 +294,7 @@ static void ucc_configure(phys_addr_t start, phys_addr_t end) #endif -static struct of_device_id fsl_pq_mdio_match[] = { +static const struct of_device_id fsl_pq_mdio_match[] = { #if defined(CONFIG_GIANFAR) || defined(CONFIG_GIANFAR_MODULE) { .compatible = "fsl,gianfar-tbi", diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 7bf3682cdf47..4ee080d49bc0 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -158,7 +158,7 @@ static void gfar_init_rxbdp(struct gfar_priv_rx_q *rx_queue, struct rxbd8 *bdp, { u32 lstatus; - bdp->bufPtr = buf; + bdp->bufPtr = cpu_to_be32(buf); lstatus = BD_LFLAG(RXBD_EMPTY | RXBD_INTERRUPT); if (bdp == rx_queue->rx_bd_base + rx_queue->rx_ring_size - 1) @@ -166,7 +166,7 @@ static void gfar_init_rxbdp(struct gfar_priv_rx_q *rx_queue, struct rxbd8 *bdp, gfar_wmb(); - bdp->lstatus = lstatus; + bdp->lstatus = cpu_to_be32(lstatus); } static int gfar_init_bds(struct net_device *ndev) @@ -200,7 +200,8 @@ static int gfar_init_bds(struct net_device *ndev) /* Set the last descriptor in the ring to indicate wrap */ txbdp--; - txbdp->status |= TXBD_WRAP; + txbdp->status = cpu_to_be16(be16_to_cpu(txbdp->status) | + TXBD_WRAP); } rfbptr = ®s->rfbptr0; @@ -214,7 +215,7 @@ static int gfar_init_bds(struct net_device *ndev) struct sk_buff *skb = rx_queue->rx_skbuff[j]; if (skb) { - bufaddr = rxbdp->bufPtr; + bufaddr = be32_to_cpu(rxbdp->bufPtr); } else { skb = gfar_new_skb(ndev, &bufaddr); if (!skb) { @@ -696,19 +697,28 @@ static int gfar_parse_group(struct device_node *np, grp->priv = priv; spin_lock_init(&grp->grplock); if (priv->mode == MQ_MG_MODE) { - u32 *rxq_mask, *txq_mask; - rxq_mask = (u32 *)of_get_property(np, "fsl,rx-bit-map", NULL); - txq_mask = (u32 *)of_get_property(np, "fsl,tx-bit-map", NULL); + u32 rxq_mask, txq_mask; + int ret; + + grp->rx_bit_map = (DEFAULT_MAPPING >> priv->num_grps); + grp->tx_bit_map = (DEFAULT_MAPPING >> priv->num_grps); + + ret = of_property_read_u32(np, "fsl,rx-bit-map", &rxq_mask); + if (!ret) { + grp->rx_bit_map = rxq_mask ? + rxq_mask : (DEFAULT_MAPPING >> priv->num_grps); + } + + ret = of_property_read_u32(np, "fsl,tx-bit-map", &txq_mask); + if (!ret) { + grp->tx_bit_map = txq_mask ? + txq_mask : (DEFAULT_MAPPING >> priv->num_grps); + } if (priv->poll_mode == GFAR_SQ_POLLING) { /* One Q per interrupt group: Q0 to G0, Q1 to G1 */ grp->rx_bit_map = (DEFAULT_MAPPING >> priv->num_grps); grp->tx_bit_map = (DEFAULT_MAPPING >> priv->num_grps); - } else { /* GFAR_MQ_POLLING */ - grp->rx_bit_map = rxq_mask ? - *rxq_mask : (DEFAULT_MAPPING >> priv->num_grps); - grp->tx_bit_map = txq_mask ? - *txq_mask : (DEFAULT_MAPPING >> priv->num_grps); } } else { grp->rx_bit_map = 0xFF; @@ -769,11 +779,10 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) struct gfar_private *priv = NULL; struct device_node *np = ofdev->dev.of_node; struct device_node *child = NULL; - const u32 *stash; - const u32 *stash_len; - const u32 *stash_idx; + struct property *stash; + u32 stash_len = 0; + u32 stash_idx = 0; unsigned int num_tx_qs, num_rx_qs; - u32 *tx_queues, *rx_queues; unsigned short mode, poll_mode; if (!np) @@ -787,10 +796,6 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) poll_mode = GFAR_SQ_POLLING; } - /* parse the num of HW tx and rx queues */ - tx_queues = (u32 *)of_get_property(np, "fsl,num_tx_queues", NULL); - rx_queues = (u32 *)of_get_property(np, "fsl,num_rx_queues", NULL); - if (mode == SQ_SG_MODE) { num_tx_qs = 1; num_rx_qs = 1; @@ -809,8 +814,17 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) num_tx_qs = num_grps; /* one txq per int group */ num_rx_qs = num_grps; /* one rxq per int group */ } else { /* GFAR_MQ_POLLING */ - num_tx_qs = tx_queues ? *tx_queues : 1; - num_rx_qs = rx_queues ? *rx_queues : 1; + u32 tx_queues, rx_queues; + int ret; + + /* parse the num of HW tx and rx queues */ + ret = of_property_read_u32(np, "fsl,num_tx_queues", + &tx_queues); + num_tx_qs = ret ? 1 : tx_queues; + + ret = of_property_read_u32(np, "fsl,num_rx_queues", + &rx_queues); + num_rx_qs = ret ? 1 : rx_queues; } } @@ -851,13 +865,17 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) if (err) goto rx_alloc_failed; + err = of_property_read_string(np, "model", &model); + if (err) { + pr_err("Device model property missing, aborting\n"); + goto rx_alloc_failed; + } + /* Init Rx queue filer rule set linked list */ INIT_LIST_HEAD(&priv->rx_list.list); priv->rx_list.count = 0; mutex_init(&priv->rx_queue_access); - model = of_get_property(np, "model", NULL); - for (i = 0; i < MAXGROUPS; i++) priv->gfargrp[i].regs = NULL; @@ -877,22 +895,22 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) goto err_grp_init; } - stash = of_get_property(np, "bd-stash", NULL); + stash = of_find_property(np, "bd-stash", NULL); if (stash) { priv->device_flags |= FSL_GIANFAR_DEV_HAS_BD_STASHING; priv->bd_stash_en = 1; } - stash_len = of_get_property(np, "rx-stash-len", NULL); + err = of_property_read_u32(np, "rx-stash-len", &stash_len); - if (stash_len) - priv->rx_stash_size = *stash_len; + if (err == 0) + priv->rx_stash_size = stash_len; - stash_idx = of_get_property(np, "rx-stash-idx", NULL); + err = of_property_read_u32(np, "rx-stash-idx", &stash_idx); - if (stash_idx) - priv->rx_stash_index = *stash_idx; + if (err == 0) + priv->rx_stash_index = stash_idx; if (stash_len || stash_idx) priv->device_flags |= FSL_GIANFAR_DEV_HAS_BUF_STASHING; @@ -919,15 +937,15 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) FSL_GIANFAR_DEV_HAS_EXTENDED_HASH | FSL_GIANFAR_DEV_HAS_TIMER; - ctype = of_get_property(np, "phy-connection-type", NULL); + err = of_property_read_string(np, "phy-connection-type", &ctype); /* We only care about rgmii-id. The rest are autodetected */ - if (ctype && !strcmp(ctype, "rgmii-id")) + if (err == 0 && !strcmp(ctype, "rgmii-id")) priv->interface = PHY_INTERFACE_MODE_RGMII_ID; else priv->interface = PHY_INTERFACE_MODE_MII; - if (of_get_property(np, "fsl,magic-packet", NULL)) + if (of_find_property(np, "fsl,magic-packet", NULL)) priv->device_flags |= FSL_GIANFAR_DEV_HAS_MAGIC_PACKET; priv->phy_node = of_parse_phandle(np, "phy-handle", 0); @@ -1884,14 +1902,15 @@ static void free_skb_tx_queue(struct gfar_priv_tx_q *tx_queue) if (!tx_queue->tx_skbuff[i]) continue; - dma_unmap_single(priv->dev, txbdp->bufPtr, - txbdp->length, DMA_TO_DEVICE); + dma_unmap_single(priv->dev, be32_to_cpu(txbdp->bufPtr), + be16_to_cpu(txbdp->length), DMA_TO_DEVICE); txbdp->lstatus = 0; for (j = 0; j < skb_shinfo(tx_queue->tx_skbuff[i])->nr_frags; j++) { txbdp++; - dma_unmap_page(priv->dev, txbdp->bufPtr, - txbdp->length, DMA_TO_DEVICE); + dma_unmap_page(priv->dev, be32_to_cpu(txbdp->bufPtr), + be16_to_cpu(txbdp->length), + DMA_TO_DEVICE); } txbdp++; dev_kfree_skb_any(tx_queue->tx_skbuff[i]); @@ -1911,7 +1930,7 @@ static void free_skb_rx_queue(struct gfar_priv_rx_q *rx_queue) for (i = 0; i < rx_queue->rx_ring_size; i++) { if (rx_queue->rx_skbuff[i]) { - dma_unmap_single(priv->dev, rxbdp->bufPtr, + dma_unmap_single(priv->dev, be32_to_cpu(rxbdp->bufPtr), priv->rx_buffer_size, DMA_FROM_DEVICE); dev_kfree_skb_any(rx_queue->rx_skbuff[i]); @@ -2167,16 +2186,16 @@ static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb, */ if (ip_hdr(skb)->protocol == IPPROTO_UDP) { flags |= TXFCB_UDP; - fcb->phcs = udp_hdr(skb)->check; + fcb->phcs = (__force __be16)(udp_hdr(skb)->check); } else - fcb->phcs = tcp_hdr(skb)->check; + fcb->phcs = (__force __be16)(tcp_hdr(skb)->check); /* l3os is the distance between the start of the * frame (skb->data) and the start of the IP hdr. * l4os is the distance between the start of the * l3 hdr and the l4 hdr */ - fcb->l3os = (u16)(skb_network_offset(skb) - fcb_length); + fcb->l3os = (u8)(skb_network_offset(skb) - fcb_length); fcb->l4os = skb_network_header_len(skb); fcb->flags = flags; @@ -2185,7 +2204,7 @@ static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb, void inline gfar_tx_vlan(struct sk_buff *skb, struct txfcb *fcb) { fcb->flags |= TXFCB_VLN; - fcb->vlctl = skb_vlan_tag_get(skb); + fcb->vlctl = cpu_to_be16(skb_vlan_tag_get(skb)); } static inline struct txbd8 *skip_txbd(struct txbd8 *bdp, int stride, @@ -2298,7 +2317,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_queue->stats.tx_packets++; txbdp = txbdp_start = tx_queue->cur_tx; - lstatus = txbdp->lstatus; + lstatus = be32_to_cpu(txbdp->lstatus); /* Time stamp insertion requires one additional TxBD */ if (unlikely(do_tstamp)) @@ -2306,11 +2325,14 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_queue->tx_ring_size); if (nr_frags == 0) { - if (unlikely(do_tstamp)) - txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_LAST | - TXBD_INTERRUPT); - else + if (unlikely(do_tstamp)) { + u32 lstatus_ts = be32_to_cpu(txbdp_tstamp->lstatus); + + lstatus_ts |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT); + txbdp_tstamp->lstatus = cpu_to_be32(lstatus_ts); + } else { lstatus |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT); + } } else { /* Place the fragment addresses and lengths into the TxBDs */ for (i = 0; i < nr_frags; i++) { @@ -2320,7 +2342,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) frag_len = skb_shinfo(skb)->frags[i].size; - lstatus = txbdp->lstatus | frag_len | + lstatus = be32_to_cpu(txbdp->lstatus) | frag_len | BD_LFLAG(TXBD_READY); /* Handle the last BD specially */ @@ -2336,11 +2358,11 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) goto dma_map_err; /* set the TxBD length and buffer pointer */ - txbdp->bufPtr = bufaddr; - txbdp->lstatus = lstatus; + txbdp->bufPtr = cpu_to_be32(bufaddr); + txbdp->lstatus = cpu_to_be32(lstatus); } - lstatus = txbdp_start->lstatus; + lstatus = be32_to_cpu(txbdp_start->lstatus); } /* Add TxPAL between FCB and frame if required */ @@ -2388,7 +2410,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(dma_mapping_error(priv->dev, bufaddr))) goto dma_map_err; - txbdp_start->bufPtr = bufaddr; + txbdp_start->bufPtr = cpu_to_be32(bufaddr); /* If time stamping is requested one additional TxBD must be set up. The * first TxBD points to the FCB and must have a data length of @@ -2396,9 +2418,15 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) * the full frame length. */ if (unlikely(do_tstamp)) { - txbdp_tstamp->bufPtr = txbdp_start->bufPtr + fcb_len; - txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_READY) | - (skb_headlen(skb) - fcb_len); + u32 lstatus_ts = be32_to_cpu(txbdp_tstamp->lstatus); + + bufaddr = be32_to_cpu(txbdp_start->bufPtr); + bufaddr += fcb_len; + lstatus_ts |= BD_LFLAG(TXBD_READY) | + (skb_headlen(skb) - fcb_len); + + txbdp_tstamp->bufPtr = cpu_to_be32(bufaddr); + txbdp_tstamp->lstatus = cpu_to_be32(lstatus_ts); lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | GMAC_FCB_LEN; } else { lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb); @@ -2421,7 +2449,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) gfar_wmb(); - txbdp_start->lstatus = lstatus; + txbdp_start->lstatus = cpu_to_be32(lstatus); gfar_wmb(); /* force lstatus write before tx_skbuff */ @@ -2460,13 +2488,14 @@ dma_map_err: if (do_tstamp) txbdp = next_txbd(txbdp, base, tx_queue->tx_ring_size); for (i = 0; i < nr_frags; i++) { - lstatus = txbdp->lstatus; + lstatus = be32_to_cpu(txbdp->lstatus); if (!(lstatus & BD_LFLAG(TXBD_READY))) break; - txbdp->lstatus = lstatus & ~BD_LFLAG(TXBD_READY); - bufaddr = txbdp->bufPtr; - dma_unmap_page(priv->dev, bufaddr, txbdp->length, + lstatus &= ~BD_LFLAG(TXBD_READY); + txbdp->lstatus = cpu_to_be32(lstatus); + bufaddr = be32_to_cpu(txbdp->bufPtr); + dma_unmap_page(priv->dev, bufaddr, be16_to_cpu(txbdp->length), DMA_TO_DEVICE); txbdp = next_txbd(txbdp, base, tx_queue->tx_ring_size); } @@ -2607,7 +2636,7 @@ static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue) lbdp = skip_txbd(bdp, nr_txbds - 1, base, tx_ring_size); - lstatus = lbdp->lstatus; + lstatus = be32_to_cpu(lbdp->lstatus); /* Only clean completed frames */ if ((lstatus & BD_LFLAG(TXBD_READY)) && @@ -2616,11 +2645,12 @@ static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue) if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) { next = next_txbd(bdp, base, tx_ring_size); - buflen = next->length + GMAC_FCB_LEN + GMAC_TXPAL_LEN; + buflen = be16_to_cpu(next->length) + + GMAC_FCB_LEN + GMAC_TXPAL_LEN; } else - buflen = bdp->length; + buflen = be16_to_cpu(bdp->length); - dma_unmap_single(priv->dev, bdp->bufPtr, + dma_unmap_single(priv->dev, be32_to_cpu(bdp->bufPtr), buflen, DMA_TO_DEVICE); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) { @@ -2631,17 +2661,18 @@ static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue) shhwtstamps.hwtstamp = ns_to_ktime(*ns); skb_pull(skb, GMAC_FCB_LEN + GMAC_TXPAL_LEN); skb_tstamp_tx(skb, &shhwtstamps); - bdp->lstatus &= BD_LFLAG(TXBD_WRAP); + gfar_clear_txbd_status(bdp); bdp = next; } - bdp->lstatus &= BD_LFLAG(TXBD_WRAP); + gfar_clear_txbd_status(bdp); bdp = next_txbd(bdp, base, tx_ring_size); for (i = 0; i < frags; i++) { - dma_unmap_page(priv->dev, bdp->bufPtr, - bdp->length, DMA_TO_DEVICE); - bdp->lstatus &= BD_LFLAG(TXBD_WRAP); + dma_unmap_page(priv->dev, be32_to_cpu(bdp->bufPtr), + be16_to_cpu(bdp->length), + DMA_TO_DEVICE); + gfar_clear_txbd_status(bdp); bdp = next_txbd(bdp, base, tx_ring_size); } @@ -2798,13 +2829,13 @@ static inline void gfar_rx_checksum(struct sk_buff *skb, struct rxfcb *fcb) * were verified, then we tell the kernel that no * checksumming is necessary. Otherwise, it is [FIXME] */ - if ((fcb->flags & RXFCB_CSUM_MASK) == (RXFCB_CIP | RXFCB_CTU)) + if ((be16_to_cpu(fcb->flags) & RXFCB_CSUM_MASK) == + (RXFCB_CIP | RXFCB_CTU)) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb_checksum_none_assert(skb); } - /* gfar_process_frame() -- handle one incoming packet if skb isn't NULL. */ static void gfar_process_frame(struct net_device *dev, struct sk_buff *skb, int amount_pull, struct napi_struct *napi) @@ -2846,8 +2877,9 @@ static void gfar_process_frame(struct net_device *dev, struct sk_buff *skb, * RXFCB_VLN is pseudo randomly set. */ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX && - fcb->flags & RXFCB_VLN) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), fcb->vlctl); + be16_to_cpu(fcb->flags) & RXFCB_VLN) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + be16_to_cpu(fcb->vlctl)); /* Send the packet up the stack */ napi_gro_receive(napi, skb); @@ -2874,7 +2906,7 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit) amount_pull = priv->uses_rxfcb ? GMAC_FCB_LEN : 0; - while (!((bdp->status & RXBD_EMPTY) || (--rx_work_limit < 0))) { + while (!(be16_to_cpu(bdp->status) & RXBD_EMPTY) && rx_work_limit--) { struct sk_buff *newskb; dma_addr_t bufaddr; @@ -2885,21 +2917,22 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit) skb = rx_queue->rx_skbuff[rx_queue->skb_currx]; - dma_unmap_single(priv->dev, bdp->bufPtr, + dma_unmap_single(priv->dev, be32_to_cpu(bdp->bufPtr), priv->rx_buffer_size, DMA_FROM_DEVICE); - if (unlikely(!(bdp->status & RXBD_ERR) && - bdp->length > priv->rx_buffer_size)) - bdp->status = RXBD_LARGE; + if (unlikely(!(be16_to_cpu(bdp->status) & RXBD_ERR) && + be16_to_cpu(bdp->length) > priv->rx_buffer_size)) + bdp->status = cpu_to_be16(RXBD_LARGE); /* We drop the frame if we failed to allocate a new buffer */ - if (unlikely(!newskb || !(bdp->status & RXBD_LAST) || - bdp->status & RXBD_ERR)) { - count_errors(bdp->status, dev); + if (unlikely(!newskb || + !(be16_to_cpu(bdp->status) & RXBD_LAST) || + be16_to_cpu(bdp->status) & RXBD_ERR)) { + count_errors(be16_to_cpu(bdp->status), dev); if (unlikely(!newskb)) { newskb = skb; - bufaddr = bdp->bufPtr; + bufaddr = be32_to_cpu(bdp->bufPtr); } else if (skb) dev_kfree_skb(skb); } else { @@ -2908,7 +2941,8 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit) howmany++; if (likely(skb)) { - pkt_len = bdp->length - ETH_FCS_LEN; + pkt_len = be16_to_cpu(bdp->length) - + ETH_FCS_LEN; /* Remove the FCS from the packet length */ skb_put(skb, pkt_len); rx_queue->stats.rx_bytes += pkt_len; @@ -3560,7 +3594,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv) phy_print_status(phydev); } -static struct of_device_id gfar_match[] = +static const struct of_device_id gfar_match[] = { { .type = "network", diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h index 9e1802400c23..daa1d37de642 100644 --- a/drivers/net/ethernet/freescale/gianfar.h +++ b/drivers/net/ethernet/freescale/gianfar.h @@ -544,12 +544,12 @@ struct txbd8 { union { struct { - u16 status; /* Status Fields */ - u16 length; /* Buffer length */ + __be16 status; /* Status Fields */ + __be16 length; /* Buffer length */ }; - u32 lstatus; + __be32 lstatus; }; - u32 bufPtr; /* Buffer Pointer */ + __be32 bufPtr; /* Buffer Pointer */ }; struct txfcb { @@ -557,28 +557,28 @@ struct txfcb { u8 ptp; /* Flag to enable tx timestamping */ u8 l4os; /* Level 4 Header Offset */ u8 l3os; /* Level 3 Header Offset */ - u16 phcs; /* Pseudo-header Checksum */ - u16 vlctl; /* VLAN control word */ + __be16 phcs; /* Pseudo-header Checksum */ + __be16 vlctl; /* VLAN control word */ }; struct rxbd8 { union { struct { - u16 status; /* Status Fields */ - u16 length; /* Buffer Length */ + __be16 status; /* Status Fields */ + __be16 length; /* Buffer Length */ }; - u32 lstatus; + __be32 lstatus; }; - u32 bufPtr; /* Buffer Pointer */ + __be32 bufPtr; /* Buffer Pointer */ }; struct rxfcb { - u16 flags; + __be16 flags; u8 rq; /* Receive Queue index */ u8 pro; /* Layer 4 Protocol */ u16 reserved; - u16 vlctl; /* VLAN control word */ + __be16 vlctl; /* VLAN control word */ }; struct gianfar_skb_cb { @@ -1287,6 +1287,14 @@ static inline void gfar_wmb(void) #endif } +static inline void gfar_clear_txbd_status(struct txbd8 *bdp) +{ + u32 lstatus = be32_to_cpu(bdp->lstatus); + + lstatus &= BD_LFLAG(TXBD_WRAP); + bdp->lstatus = cpu_to_be32(lstatus); +} + irqreturn_t gfar_receive(int irq, void *dev_id); int startup_gfar(struct net_device *dev); void stop_gfar(struct net_device *dev); diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c index 16826341a4c9..77353366f33b 100644 --- a/drivers/net/ethernet/freescale/gianfar_ptp.c +++ b/drivers/net/ethernet/freescale/gianfar_ptp.c @@ -554,7 +554,7 @@ static int gianfar_ptp_remove(struct platform_device *dev) return 0; } -static struct of_device_id match_table[] = { +static const struct of_device_id match_table[] = { { .compatible = "fsl,etsec-ptp" }, {}, }; diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 357e8b576905..bfdccbd58be0 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3930,7 +3930,7 @@ static int ucc_geth_remove(struct platform_device* ofdev) return 0; } -static struct of_device_id ucc_geth_match[] = { +static const struct of_device_id ucc_geth_match[] = { { .type = "network", .compatible = "ucc_geth", diff --git a/drivers/net/ethernet/freescale/xgmac_mdio.c b/drivers/net/ethernet/freescale/xgmac_mdio.c index 3a83bc2c613c..7b8fe866f603 100644 --- a/drivers/net/ethernet/freescale/xgmac_mdio.c +++ b/drivers/net/ethernet/freescale/xgmac_mdio.c @@ -46,17 +46,43 @@ struct tgec_mdio_controller { #define MDIO_DATA(x) (x & 0xffff) #define MDIO_DATA_BSY BIT(31) +struct mdio_fsl_priv { + struct tgec_mdio_controller __iomem *mdio_base; + bool is_little_endian; +}; + +static u32 xgmac_read32(void __iomem *regs, + bool is_little_endian) +{ + if (is_little_endian) + return ioread32(regs); + else + return ioread32be(regs); +} + +static void xgmac_write32(u32 value, + void __iomem *regs, + bool is_little_endian) +{ + if (is_little_endian) + iowrite32(value, regs); + else + iowrite32be(value, regs); +} + /* * Wait until the MDIO bus is free */ static int xgmac_wait_until_free(struct device *dev, - struct tgec_mdio_controller __iomem *regs) + struct tgec_mdio_controller __iomem *regs, + bool is_little_endian) { unsigned int timeout; /* Wait till the bus is free */ timeout = TIMEOUT; - while ((ioread32be(®s->mdio_stat) & MDIO_STAT_BSY) && timeout) { + while ((xgmac_read32(®s->mdio_stat, is_little_endian) & + MDIO_STAT_BSY) && timeout) { cpu_relax(); timeout--; } @@ -73,13 +99,15 @@ static int xgmac_wait_until_free(struct device *dev, * Wait till the MDIO read or write operation is complete */ static int xgmac_wait_until_done(struct device *dev, - struct tgec_mdio_controller __iomem *regs) + struct tgec_mdio_controller __iomem *regs, + bool is_little_endian) { unsigned int timeout; /* Wait till the MDIO write is complete */ timeout = TIMEOUT; - while ((ioread32be(®s->mdio_data) & MDIO_DATA_BSY) && timeout) { + while ((xgmac_read32(®s->mdio_stat, is_little_endian) & + MDIO_STAT_BSY) && timeout) { cpu_relax(); timeout--; } @@ -99,12 +127,14 @@ static int xgmac_wait_until_done(struct device *dev, */ static int xgmac_mdio_write(struct mii_bus *bus, int phy_id, int regnum, u16 value) { - struct tgec_mdio_controller __iomem *regs = bus->priv; + struct mdio_fsl_priv *priv = (struct mdio_fsl_priv *)bus->priv; + struct tgec_mdio_controller __iomem *regs = priv->mdio_base; uint16_t dev_addr; u32 mdio_ctl, mdio_stat; int ret; + bool endian = priv->is_little_endian; - mdio_stat = ioread32be(®s->mdio_stat); + mdio_stat = xgmac_read32(®s->mdio_stat, endian); if (regnum & MII_ADDR_C45) { /* Clause 45 (ie 10G) */ dev_addr = (regnum >> 16) & 0x1f; @@ -115,29 +145,29 @@ static int xgmac_mdio_write(struct mii_bus *bus, int phy_id, int regnum, u16 val mdio_stat &= ~MDIO_STAT_ENC; } - iowrite32be(mdio_stat, ®s->mdio_stat); + xgmac_write32(mdio_stat, ®s->mdio_stat, endian); - ret = xgmac_wait_until_free(&bus->dev, regs); + ret = xgmac_wait_until_free(&bus->dev, regs, endian); if (ret) return ret; /* Set the port and dev addr */ mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr); - iowrite32be(mdio_ctl, ®s->mdio_ctl); + xgmac_write32(mdio_ctl, ®s->mdio_ctl, endian); /* Set the register address */ if (regnum & MII_ADDR_C45) { - iowrite32be(regnum & 0xffff, ®s->mdio_addr); + xgmac_write32(regnum & 0xffff, ®s->mdio_addr, endian); - ret = xgmac_wait_until_free(&bus->dev, regs); + ret = xgmac_wait_until_free(&bus->dev, regs, endian); if (ret) return ret; } /* Write the value to the register */ - iowrite32be(MDIO_DATA(value), ®s->mdio_data); + xgmac_write32(MDIO_DATA(value), ®s->mdio_data, endian); - ret = xgmac_wait_until_done(&bus->dev, regs); + ret = xgmac_wait_until_done(&bus->dev, regs, endian); if (ret) return ret; @@ -151,14 +181,16 @@ static int xgmac_mdio_write(struct mii_bus *bus, int phy_id, int regnum, u16 val */ static int xgmac_mdio_read(struct mii_bus *bus, int phy_id, int regnum) { - struct tgec_mdio_controller __iomem *regs = bus->priv; + struct mdio_fsl_priv *priv = (struct mdio_fsl_priv *)bus->priv; + struct tgec_mdio_controller __iomem *regs = priv->mdio_base; uint16_t dev_addr; uint32_t mdio_stat; uint32_t mdio_ctl; uint16_t value; int ret; + bool endian = priv->is_little_endian; - mdio_stat = ioread32be(®s->mdio_stat); + mdio_stat = xgmac_read32(®s->mdio_stat, endian); if (regnum & MII_ADDR_C45) { dev_addr = (regnum >> 16) & 0x1f; mdio_stat |= MDIO_STAT_ENC; @@ -167,41 +199,41 @@ static int xgmac_mdio_read(struct mii_bus *bus, int phy_id, int regnum) mdio_stat &= ~MDIO_STAT_ENC; } - iowrite32be(mdio_stat, ®s->mdio_stat); + xgmac_write32(mdio_stat, ®s->mdio_stat, endian); - ret = xgmac_wait_until_free(&bus->dev, regs); + ret = xgmac_wait_until_free(&bus->dev, regs, endian); if (ret) return ret; /* Set the Port and Device Addrs */ mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr); - iowrite32be(mdio_ctl, ®s->mdio_ctl); + xgmac_write32(mdio_ctl, ®s->mdio_ctl, endian); /* Set the register address */ if (regnum & MII_ADDR_C45) { - iowrite32be(regnum & 0xffff, ®s->mdio_addr); + xgmac_write32(regnum & 0xffff, ®s->mdio_addr, endian); - ret = xgmac_wait_until_free(&bus->dev, regs); + ret = xgmac_wait_until_free(&bus->dev, regs, endian); if (ret) return ret; } /* Initiate the read */ - iowrite32be(mdio_ctl | MDIO_CTL_READ, ®s->mdio_ctl); + xgmac_write32(mdio_ctl | MDIO_CTL_READ, ®s->mdio_ctl, endian); - ret = xgmac_wait_until_done(&bus->dev, regs); + ret = xgmac_wait_until_done(&bus->dev, regs, endian); if (ret) return ret; /* Return all Fs if nothing was there */ - if (ioread32be(®s->mdio_stat) & MDIO_STAT_RD_ER) { + if (xgmac_read32(®s->mdio_stat, endian) & MDIO_STAT_RD_ER) { dev_err(&bus->dev, "Error while reading PHY%d reg at %d.%hhu\n", phy_id, dev_addr, regnum); return 0xffff; } - value = ioread32be(®s->mdio_data) & 0xffff; + value = xgmac_read32(®s->mdio_data, endian) & 0xffff; dev_dbg(&bus->dev, "read %04x\n", value); return value; @@ -212,6 +244,7 @@ static int xgmac_mdio_probe(struct platform_device *pdev) struct device_node *np = pdev->dev.of_node; struct mii_bus *bus; struct resource res; + struct mdio_fsl_priv *priv; int ret; ret = of_address_to_resource(np, 0, &res); @@ -220,7 +253,7 @@ static int xgmac_mdio_probe(struct platform_device *pdev) return ret; } - bus = mdiobus_alloc(); + bus = mdiobus_alloc_size(sizeof(struct mdio_fsl_priv)); if (!bus) return -ENOMEM; @@ -231,12 +264,19 @@ static int xgmac_mdio_probe(struct platform_device *pdev) snprintf(bus->id, MII_BUS_ID_SIZE, "%llx", (unsigned long long)res.start); /* Set the PHY base address */ - bus->priv = of_iomap(np, 0); - if (!bus->priv) { + priv = bus->priv; + priv->mdio_base = of_iomap(np, 0); + if (!priv->mdio_base) { ret = -ENOMEM; goto err_ioremap; } + if (of_get_property(pdev->dev.of_node, + "little-endian", NULL)) + priv->is_little_endian = true; + else + priv->is_little_endian = false; + ret = of_mdiobus_register(bus, np); if (ret) { dev_err(&pdev->dev, "cannot register MDIO bus\n"); @@ -248,7 +288,7 @@ static int xgmac_mdio_probe(struct platform_device *pdev) return 0; err_registration: - iounmap(bus->priv); + iounmap(priv->mdio_base); err_ioremap: mdiobus_free(bus); @@ -267,7 +307,7 @@ static int xgmac_mdio_remove(struct platform_device *pdev) return 0; } -static struct of_device_id xgmac_mdio_match[] = { +static const struct of_device_id xgmac_mdio_match[] = { { .compatible = "fsl,fman-xmdio", }, diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index c05e50759621..291c87036e17 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -103,7 +103,7 @@ static int ehea_probe_adapter(struct platform_device *dev); static int ehea_remove(struct platform_device *dev); -static struct of_device_id ehea_module_device_table[] = { +static const struct of_device_id ehea_module_device_table[] = { { .name = "lhea", .compatible = "IBM,lhea", @@ -116,7 +116,7 @@ static struct of_device_id ehea_module_device_table[] = { }; MODULE_DEVICE_TABLE(of, ehea_module_device_table); -static struct of_device_id ehea_device_table[] = { +static const struct of_device_id ehea_device_table[] = { { .name = "lhea", .compatible = "IBM,lhea", diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 162762d1a12c..8a17b97baa20 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2981,7 +2981,7 @@ static int emac_remove(struct platform_device *ofdev) } /* XXX Features in here should be replaced by properties... */ -static struct of_device_id emac_match[] = +static const struct of_device_id emac_match[] = { { .type = "network", diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index dddaab11a4c7..fdb5cdb3cd15 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -753,7 +753,7 @@ static int mal_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id mal_platform_match[] = +static const struct of_device_id mal_platform_match[] = { { .compatible = "ibm,mcmal", diff --git a/drivers/net/ethernet/ibm/emac/rgmii.c b/drivers/net/ethernet/ibm/emac/rgmii.c index 457088fc5b06..206ccbbae7bb 100644 --- a/drivers/net/ethernet/ibm/emac/rgmii.c +++ b/drivers/net/ethernet/ibm/emac/rgmii.c @@ -305,7 +305,7 @@ static int rgmii_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id rgmii_match[] = +static const struct of_device_id rgmii_match[] = { { .compatible = "ibm,rgmii", diff --git a/drivers/net/ethernet/ibm/emac/tah.c b/drivers/net/ethernet/ibm/emac/tah.c index cb18e7f917c6..32cb6c9007c5 100644 --- a/drivers/net/ethernet/ibm/emac/tah.c +++ b/drivers/net/ethernet/ibm/emac/tah.c @@ -148,7 +148,7 @@ static int tah_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id tah_match[] = +static const struct of_device_id tah_match[] = { { .compatible = "ibm,tah", diff --git a/drivers/net/ethernet/ibm/emac/zmii.c b/drivers/net/ethernet/ibm/emac/zmii.c index 36409ccb75ea..8727b865ea02 100644 --- a/drivers/net/ethernet/ibm/emac/zmii.c +++ b/drivers/net/ethernet/ibm/emac/zmii.c @@ -295,7 +295,7 @@ static int zmii_remove(struct platform_device *ofdev) return 0; } -static struct of_device_id zmii_match[] = +static const struct of_device_id zmii_match[] = { { .compatible = "ibm,zmii", diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index c59ed925adaf..763ba2641d5c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2379,6 +2379,33 @@ static netdev_features_t mlx4_en_features_check(struct sk_buff *skb, } #endif +static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 maxrate) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[queue_index]; + struct mlx4_update_qp_params params; + int err; + + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT)) + return -EOPNOTSUPP; + + /* rate provided to us in Mbs, check if it fits into 12 bits, if not use Gbs */ + if (maxrate >> 12) { + params.rate_unit = MLX4_QP_RATE_LIMIT_GBS; + params.rate_val = maxrate / 1000; + } else if (maxrate) { + params.rate_unit = MLX4_QP_RATE_LIMIT_MBS; + params.rate_val = maxrate; + } else { /* zero serves to revoke the QP rate-limitation */ + params.rate_unit = 0; + params.rate_val = 0; + } + + err = mlx4_update_qp(priv->mdev->dev, tx_ring->qpn, MLX4_UPDATE_QP_RATE_LIMIT, + ¶ms); + return err; +} + static const struct net_device_ops mlx4_netdev_ops = { .ndo_open = mlx4_en_open, .ndo_stop = mlx4_en_close, @@ -2410,6 +2437,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, #endif + .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, }; static const struct net_device_ops mlx4_netdev_ops_master = { @@ -2444,6 +2472,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, #endif + .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, }; struct mlx4_en_bond { diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 242bcee5d774..4a471f5d1b56 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -144,7 +144,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [19] = "Performance optimized for limited rule configuration flow steering support", [20] = "Recoverable error events support", [21] = "Port Remap support", - [22] = "QCN support" + [22] = "QCN support", + [23] = "QP rate limiting support" }; int i; @@ -697,6 +698,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 #define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET 0xa8 #define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET 0xac +#define QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET 0xcc +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET 0xd0 +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET 0xd2 + dev_cap->flags2 = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -904,6 +909,18 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET); dev_cap->dmfs_high_rate_qpn_range &= MGM_QPN_MASK; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET); + dev_cap->rl_caps.num_rates = size; + if (dev_cap->rl_caps.num_rates) { + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET); + dev_cap->rl_caps.max_val = size & 0xfff; + dev_cap->rl_caps.max_unit = size >> 14; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET); + dev_cap->rl_caps.min_val = size & 0xfff; + dev_cap->rl_caps.min_unit = size >> 14; + } + MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); if (field32 & (1 << 16)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; @@ -979,6 +996,15 @@ void mlx4_dev_cap_dump(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->dmfs_high_rate_qpn_base); mlx4_dbg(dev, "DMFS high rate steer QPn range: %d\n", dev_cap->dmfs_high_rate_qpn_range); + + if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT) { + struct mlx4_rate_limit_caps *rl_caps = &dev_cap->rl_caps; + + mlx4_dbg(dev, "QP Rate-Limit: #rates %d, unit/val max %d/%d, min %d/%d\n", + rl_caps->num_rates, rl_caps->max_unit, rl_caps->max_val, + rl_caps->min_unit, rl_caps->min_val); + } + dump_dev_cap_flags(dev, dev_cap->flags); dump_dev_cap_flags2(dev, dev_cap->flags2); } @@ -1075,6 +1101,7 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, u64 flags; int err = 0; u8 field; + u16 field16; u32 bmme_flags, field32; int real_port; int slave_port; @@ -1158,6 +1185,10 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, field &= 0xfe; MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET); + /* turn off QP max-rate limiting for guests */ + field16 = 0; + MLX4_PUT(outbox->buf, field16, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index f44f7f6017ed..863655bd3947 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -127,6 +127,7 @@ struct mlx4_dev_cap { u32 max_counters; u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; + struct mlx4_rate_limit_caps rl_caps; struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1]; }; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 7e487223489a..43aa76775b5f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -489,6 +489,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.dmfs_high_rate_qpn_range = MLX4_A0_STEERING_TABLE_SIZE; } + dev->caps.rl_caps = dev_cap->rl_caps; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] = dev->caps.dmfs_high_rate_qpn_range; diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index eda29dbbfcd2..69e4462e4ee4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -442,6 +442,11 @@ int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, cmd->qp_context.param3 |= cpu_to_be32(MLX4_STRIP_VLAN); } + if (attr & MLX4_UPDATE_QP_RATE_LIMIT) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_RATE_LIMIT; + cmd->qp_context.rate_limit_params = cpu_to_be16((params->rate_unit << 14) | params->rate_val); + } + cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask); cmd->qp_mask = cpu_to_be64(qp_mask); diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index d43e25914d19..c258f8625aac 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2947,8 +2947,12 @@ static int verify_qp_parameters(struct mlx4_dev *dev, qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; optpar = be32_to_cpu(*(__be32 *) inbox->buf); - if (slave != mlx4_master_func_num(dev)) + if (slave != mlx4_master_func_num(dev)) { qp_ctx->params2 &= ~MLX4_QP_BIT_FPP; + /* setting QP rate-limit is disallowed for VFs */ + if (qp_ctx->rate_limit_params) + return -EPERM; + } switch (qp_type) { case MLX4_QP_ST_RC: diff --git a/drivers/net/ethernet/octeon/octeon_mgmt.c b/drivers/net/ethernet/octeon/octeon_mgmt.c index d36599f47af5..7bf9c028d8d7 100644 --- a/drivers/net/ethernet/octeon/octeon_mgmt.c +++ b/drivers/net/ethernet/octeon/octeon_mgmt.c @@ -1557,7 +1557,7 @@ static int octeon_mgmt_remove(struct platform_device *pdev) return 0; } -static struct of_device_id octeon_mgmt_match[] = { +static const struct of_device_id octeon_mgmt_match[] = { { .compatible = "cavium,octeon-5750-mix", }, diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 223348d8cc07..c9558e6d57ad 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -30,6 +30,7 @@ #include <linux/if_vlan.h> #include <linux/if_bridge.h> #include <linux/bitops.h> +#include <linux/ctype.h> #include <net/switchdev.h> #include <net/rtnetlink.h> #include <net/ip_fib.h> @@ -1630,6 +1631,53 @@ rocker_cmd_get_port_settings_macaddr_proc(struct rocker *rocker, return 0; } +struct port_name { + char *buf; + size_t len; +}; + +static int +rocker_cmd_get_port_settings_phys_name_proc(struct rocker *rocker, + struct rocker_port *rocker_port, + struct rocker_desc_info *desc_info, + void *priv) +{ + struct rocker_tlv *info_attrs[ROCKER_TLV_CMD_PORT_SETTINGS_MAX + 1]; + struct rocker_tlv *attrs[ROCKER_TLV_CMD_MAX + 1]; + struct port_name *name = priv; + struct rocker_tlv *attr; + size_t i, j, len; + char *str; + + rocker_tlv_parse_desc(attrs, ROCKER_TLV_CMD_MAX, desc_info); + if (!attrs[ROCKER_TLV_CMD_INFO]) + return -EIO; + + rocker_tlv_parse_nested(info_attrs, ROCKER_TLV_CMD_PORT_SETTINGS_MAX, + attrs[ROCKER_TLV_CMD_INFO]); + attr = info_attrs[ROCKER_TLV_CMD_PORT_SETTINGS_PHYS_NAME]; + if (!attr) + return -EIO; + + len = min_t(size_t, rocker_tlv_len(attr), name->len); + str = rocker_tlv_data(attr); + + /* make sure name only contains alphanumeric characters */ + for (i = j = 0; i < len; ++i) { + if (isalnum(str[i])) { + name->buf[j] = str[i]; + j++; + } + } + + if (j == 0) + return -EIO; + + name->buf[j] = '\0'; + + return 0; +} + static int rocker_cmd_set_port_settings_ethtool_prep(struct rocker *rocker, struct rocker_port *rocker_port, @@ -2955,11 +3003,16 @@ static int rocker_port_vlan_flood_group(struct rocker_port *rocker_port, struct rocker_port *p; struct rocker *rocker = rocker_port->rocker; u32 group_id = ROCKER_GROUP_L2_FLOOD(vlan_id, 0); - u32 group_ids[ROCKER_FP_PORTS_MAX]; + u32 *group_ids; u8 group_count = 0; - int err; + int err = 0; int i; + group_ids = kcalloc(rocker->port_count, sizeof(u32), + rocker_op_flags_gfp(flags)); + if (!group_ids) + return -ENOMEM; + /* Adjust the flood group for this VLAN. The flood group * references an L2 interface group for each port in this * VLAN. @@ -2977,7 +3030,7 @@ static int rocker_port_vlan_flood_group(struct rocker_port *rocker_port, /* If there are no bridged ports in this VLAN, we're done */ if (group_count == 0) - return 0; + goto no_ports_in_vlan; err = rocker_group_l2_flood(rocker_port, flags, vlan_id, group_count, group_ids, @@ -2986,6 +3039,8 @@ static int rocker_port_vlan_flood_group(struct rocker_port *rocker_port, netdev_err(rocker_port->dev, "Error (%d) port VLAN l2 flood group\n", err); +no_ports_in_vlan: + kfree(group_ids); return err; } @@ -4131,8 +4186,42 @@ static int rocker_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, rocker_port->brport_flags, mask); } -static int rocker_port_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +static int rocker_port_get_phys_port_name(struct net_device *dev, + char *buf, size_t len) +{ + struct rocker_port *rocker_port = netdev_priv(dev); + struct port_name name = { .buf = buf, .len = len }; + int err; + + err = rocker_cmd_exec(rocker_port->rocker, rocker_port, + rocker_cmd_get_port_settings_prep, NULL, + rocker_cmd_get_port_settings_phys_name_proc, + &name, false); + + return err ? -EOPNOTSUPP : 0; +} + +static const struct net_device_ops rocker_port_netdev_ops = { + .ndo_open = rocker_port_open, + .ndo_stop = rocker_port_stop, + .ndo_start_xmit = rocker_port_xmit, + .ndo_set_mac_address = rocker_port_set_mac_address, + .ndo_vlan_rx_add_vid = rocker_port_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = rocker_port_vlan_rx_kill_vid, + .ndo_fdb_add = rocker_port_fdb_add, + .ndo_fdb_del = rocker_port_fdb_del, + .ndo_fdb_dump = rocker_port_fdb_dump, + .ndo_bridge_setlink = rocker_port_bridge_setlink, + .ndo_bridge_getlink = rocker_port_bridge_getlink, + .ndo_get_phys_port_name = rocker_port_get_phys_port_name, +}; + +/******************** + * swdev interface + ********************/ + +static int rocker_port_swdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid) { struct rocker_port *rocker_port = netdev_priv(dev); struct rocker *rocker = rocker_port->rocker; @@ -4142,18 +4231,18 @@ static int rocker_port_switch_parent_id_get(struct net_device *dev, return 0; } -static int rocker_port_switch_port_stp_update(struct net_device *dev, u8 state) +static int rocker_port_swdev_port_stp_update(struct net_device *dev, u8 state) { struct rocker_port *rocker_port = netdev_priv(dev); return rocker_port_stp_update(rocker_port, state); } -static int rocker_port_switch_fib_ipv4_add(struct net_device *dev, - __be32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 nlflags, u32 tb_id) +static int rocker_port_swdev_fib_ipv4_add(struct net_device *dev, + __be32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, + u32 nlflags, u32 tb_id) { struct rocker_port *rocker_port = netdev_priv(dev); int flags = 0; @@ -4162,10 +4251,10 @@ static int rocker_port_switch_fib_ipv4_add(struct net_device *dev, fi, tb_id, flags); } -static int rocker_port_switch_fib_ipv4_del(struct net_device *dev, - __be32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) +static int rocker_port_swdev_fib_ipv4_del(struct net_device *dev, + __be32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) { struct rocker_port *rocker_port = netdev_priv(dev); int flags = ROCKER_OP_FLAG_REMOVE; @@ -4174,22 +4263,11 @@ static int rocker_port_switch_fib_ipv4_del(struct net_device *dev, fi, tb_id, flags); } -static const struct net_device_ops rocker_port_netdev_ops = { - .ndo_open = rocker_port_open, - .ndo_stop = rocker_port_stop, - .ndo_start_xmit = rocker_port_xmit, - .ndo_set_mac_address = rocker_port_set_mac_address, - .ndo_vlan_rx_add_vid = rocker_port_vlan_rx_add_vid, - .ndo_vlan_rx_kill_vid = rocker_port_vlan_rx_kill_vid, - .ndo_fdb_add = rocker_port_fdb_add, - .ndo_fdb_del = rocker_port_fdb_del, - .ndo_fdb_dump = rocker_port_fdb_dump, - .ndo_bridge_setlink = rocker_port_bridge_setlink, - .ndo_bridge_getlink = rocker_port_bridge_getlink, - .ndo_switch_parent_id_get = rocker_port_switch_parent_id_get, - .ndo_switch_port_stp_update = rocker_port_switch_port_stp_update, - .ndo_switch_fib_ipv4_add = rocker_port_switch_fib_ipv4_add, - .ndo_switch_fib_ipv4_del = rocker_port_switch_fib_ipv4_del, +static const struct swdev_ops rocker_port_swdev_ops = { + .swdev_parent_id_get = rocker_port_swdev_parent_id_get, + .swdev_port_stp_update = rocker_port_swdev_port_stp_update, + .swdev_fib_ipv4_add = rocker_port_swdev_fib_ipv4_add, + .swdev_fib_ipv4_del = rocker_port_swdev_fib_ipv4_del, }; /******************** @@ -4544,6 +4622,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) rocker_port_dev_addr_init(rocker, rocker_port); dev->netdev_ops = &rocker_port_netdev_ops; dev->ethtool_ops = &rocker_port_ethtool_ops; + dev->swdev_ops = &rocker_port_swdev_ops; netif_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx, NAPI_POLL_WEIGHT); netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx, diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h index 51e430d25138..a4e9591d7457 100644 --- a/drivers/net/ethernet/rocker/rocker.h +++ b/drivers/net/ethernet/rocker/rocker.h @@ -158,6 +158,7 @@ enum { ROCKER_TLV_CMD_PORT_SETTINGS_MACADDR, /* binary */ ROCKER_TLV_CMD_PORT_SETTINGS_MODE, /* u8 */ ROCKER_TLV_CMD_PORT_SETTINGS_LEARNING, /* u8 */ + ROCKER_TLV_CMD_PORT_SETTINGS_PHYS_NAME, /* binary */ __ROCKER_TLV_CMD_PORT_SETTINGS_MAX, ROCKER_TLV_CMD_PORT_SETTINGS_MAX = diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 9f14d8b515c7..1c4dd8081a57 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -2127,7 +2127,7 @@ static int netcp_remove(struct platform_device *pdev) return 0; } -static struct of_device_id of_match[] = { +static const struct of_device_id of_match[] = { { .compatible = "ti,netcp-1.0", }, {}, }; diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index 8fb807ea1caa..de2850497c09 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -288,7 +288,7 @@ MODULE_DEVICE_TABLE(pci, rhine_pci_tbl); * The .data field is currently only used to store quirks */ static u32 vt8500_quirks = rqWOL | rqForceReset | rq6patterns; -static struct of_device_id rhine_of_tbl[] = { +static const struct of_device_id rhine_of_tbl[] = { { .compatible = "via,vt8500-rhine", .data = &vt8500_quirks }, { } /* terminate list */ }; diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index c20206f83cc1..ae68afd50a15 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -392,7 +392,7 @@ MODULE_DEVICE_TABLE(pci, velocity_pci_id_table); * Describe the OF device identifiers that we support in this * device driver. Used for devicetree nodes. */ -static struct of_device_id velocity_of_ids[] = { +static const struct of_device_id velocity_of_ids[] = { { .compatible = "via,velocity-vt6110", .data = &chip_info_table[0] }, { /* Sentinel */ }, }; diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index dbcbf0c5bcfa..690a4c36b316 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1157,7 +1157,7 @@ static int temac_of_remove(struct platform_device *op) return 0; } -static struct of_device_id temac_of_match[] = { +static const struct of_device_id temac_of_match[] = { { .compatible = "xlnx,xps-ll-temac-1.01.b", }, { .compatible = "xlnx,xps-ll-temac-2.00.a", }, { .compatible = "xlnx,xps-ll-temac-2.02.a", }, diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index a6d2860b712c..28b7e7d9c272 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -48,7 +48,7 @@ #define AXIENET_REGS_N 32 /* Match table for of_platform binding */ -static struct of_device_id axienet_of_match[] = { +static const struct of_device_id axienet_of_match[] = { { .compatible = "xlnx,axi-ethernet-1.00.a", }, { .compatible = "xlnx,axi-ethernet-1.01.a", }, { .compatible = "xlnx,axi-ethernet-2.01.a", }, diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 9d4ce388510a..2111b91f5e49 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1231,7 +1231,7 @@ static struct net_device_ops xemaclite_netdev_ops = { }; /* Match table for OF platform binding */ -static struct of_device_id xemaclite_of_match[] = { +static const struct of_device_id xemaclite_of_match[] = { { .compatible = "xlnx,opb-ethernetlite-1.01.a", }, { .compatible = "xlnx,opb-ethernetlite-1.01.b", }, { .compatible = "xlnx,xps-ethernetlite-1.00.a", }, diff --git a/drivers/net/phy/mdio-bcm-unimac.c b/drivers/net/phy/mdio-bcm-unimac.c index 6deac6d32f57..414fdf1f343f 100644 --- a/drivers/net/phy/mdio-bcm-unimac.c +++ b/drivers/net/phy/mdio-bcm-unimac.c @@ -187,7 +187,7 @@ static int unimac_mdio_remove(struct platform_device *pdev) return 0; } -static struct of_device_id unimac_mdio_ids[] = { +static const struct of_device_id unimac_mdio_ids[] = { { .compatible = "brcm,genet-mdio-v4", }, { .compatible = "brcm,genet-mdio-v3", }, { .compatible = "brcm,genet-mdio-v2", }, diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 0a0578a592b8..49ce7ece5af3 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -249,7 +249,7 @@ static int mdio_gpio_remove(struct platform_device *pdev) return 0; } -static struct of_device_id mdio_gpio_of_match[] = { +static const struct of_device_id mdio_gpio_of_match[] = { { .compatible = "virtual,mdio-gpio", }, { /* sentinel */ } }; diff --git a/drivers/net/phy/mdio-mux-gpio.c b/drivers/net/phy/mdio-mux-gpio.c index 320eb15315c8..1a87a585e74d 100644 --- a/drivers/net/phy/mdio-mux-gpio.c +++ b/drivers/net/phy/mdio-mux-gpio.c @@ -99,7 +99,7 @@ static int mdio_mux_gpio_remove(struct platform_device *pdev) return 0; } -static struct of_device_id mdio_mux_gpio_match[] = { +static const struct of_device_id mdio_mux_gpio_match[] = { { .compatible = "mdio-mux-gpio", }, diff --git a/drivers/net/phy/mdio-mux-mmioreg.c b/drivers/net/phy/mdio-mux-mmioreg.c index 0aa985c74014..2377c1341172 100644 --- a/drivers/net/phy/mdio-mux-mmioreg.c +++ b/drivers/net/phy/mdio-mux-mmioreg.c @@ -145,7 +145,7 @@ static int mdio_mux_mmioreg_remove(struct platform_device *pdev) return 0; } -static struct of_device_id mdio_mux_mmioreg_match[] = { +static const struct of_device_id mdio_mux_mmioreg_match[] = { { .compatible = "mdio-mux-mmioreg", }, diff --git a/drivers/net/phy/mdio-octeon.c b/drivers/net/phy/mdio-octeon.c index c81052486edc..c838ad6155f7 100644 --- a/drivers/net/phy/mdio-octeon.c +++ b/drivers/net/phy/mdio-octeon.c @@ -252,7 +252,7 @@ static int octeon_mdiobus_remove(struct platform_device *pdev) return 0; } -static struct of_device_id octeon_mdiobus_match[] = { +static const struct of_device_id octeon_mdiobus_match[] = { { .compatible = "cavium,octeon-3860-mdio", }, diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 25d92d4fc625..5f749a51f356 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -127,10 +127,6 @@ struct vxlan_dev { __u8 ttl; u32 flags; /* VXLAN_F_* in vxlan.h */ - struct work_struct sock_work; - struct work_struct igmp_join; - struct work_struct igmp_leave; - unsigned long age_interval; struct timer_list age_timer; spinlock_t hash_lock; @@ -144,8 +140,6 @@ struct vxlan_dev { static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; -static void vxlan_sock_work(struct work_struct *work); - #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) @@ -1072,11 +1066,6 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) return false; } -static void vxlan_sock_hold(struct vxlan_sock *vs) -{ - atomic_inc(&vs->refcnt); -} - void vxlan_sock_release(struct vxlan_sock *vs) { struct sock *sk = vs->sock->sk; @@ -1095,17 +1084,16 @@ void vxlan_sock_release(struct vxlan_sock *vs) } EXPORT_SYMBOL_GPL(vxlan_sock_release); -/* Callback to update multicast group membership when first VNI on +/* Update multicast group membership when first VNI on * multicast asddress is brought up - * Done as workqueue because ip_mc_join_group acquires RTNL. */ -static void vxlan_igmp_join(struct work_struct *work) +static int vxlan_igmp_join(struct vxlan_dev *vxlan) { - struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; union vxlan_addr *ip = &vxlan->default_dst.remote_ip; int ifindex = vxlan->default_dst.remote_ifindex; + int ret; lock_sock(sk); if (ip->sa.sa_family == AF_INET) { @@ -1114,27 +1102,26 @@ static void vxlan_igmp_join(struct work_struct *work) .imr_ifindex = ifindex, }; - ip_mc_join_group(sk, &mreq); + ret = ip_mc_join_group(sk, &mreq); #if IS_ENABLED(CONFIG_IPV6) } else { - ipv6_stub->ipv6_sock_mc_join(sk, ifindex, - &ip->sin6.sin6_addr); + ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, + &ip->sin6.sin6_addr); #endif } release_sock(sk); - vxlan_sock_release(vs); - dev_put(vxlan->dev); + return ret; } /* Inverse of vxlan_igmp_join when last VNI is brought down */ -static void vxlan_igmp_leave(struct work_struct *work) +static int vxlan_igmp_leave(struct vxlan_dev *vxlan) { - struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; union vxlan_addr *ip = &vxlan->default_dst.remote_ip; int ifindex = vxlan->default_dst.remote_ifindex; + int ret; lock_sock(sk); if (ip->sa.sa_family == AF_INET) { @@ -1143,18 +1130,16 @@ static void vxlan_igmp_leave(struct work_struct *work) .imr_ifindex = ifindex, }; - ip_mc_leave_group(sk, &mreq); + ret = ip_mc_leave_group(sk, &mreq); #if IS_ENABLED(CONFIG_IPV6) } else { - ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, - &ip->sin6.sin6_addr); + ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, + &ip->sin6.sin6_addr); #endif } - release_sock(sk); - vxlan_sock_release(vs); - dev_put(vxlan->dev); + return ret; } static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, @@ -2175,37 +2160,22 @@ static void vxlan_cleanup(unsigned long arg) static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) { + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); __u32 vni = vxlan->default_dst.remote_vni; vxlan->vn_sock = vs; + spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); + spin_unlock(&vn->sock_lock); } /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - struct vxlan_sock *vs; - bool ipv6 = vxlan->flags & VXLAN_F_IPV6; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, - vxlan->dst_port, vxlan->flags); - if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) { - /* If we have a socket with same port already, reuse it */ - vxlan_vs_add_dev(vs, vxlan); - } else { - /* otherwise make new socket outside of RTNL */ - dev_hold(dev); - queue_work(vxlan_wq, &vxlan->sock_work); - } - spin_unlock(&vn->sock_lock); - return 0; } @@ -2223,12 +2193,9 @@ static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_sock *vs = vxlan->vn_sock; vxlan_fdb_delete_default(vxlan); - if (vs) - vxlan_sock_release(vs); free_percpu(dev->tstats); } @@ -2236,22 +2203,28 @@ static void vxlan_uninit(struct net_device *dev) static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_sock *vs = vxlan->vn_sock; + struct vxlan_sock *vs; + int ret = 0; - /* socket hasn't been created */ - if (!vs) - return -ENOTCONN; + vs = vxlan_sock_add(vxlan->net, vxlan->dst_port, vxlan_rcv, NULL, + false, vxlan->flags); + if (IS_ERR(vs)) + return PTR_ERR(vs); + + vxlan_vs_add_dev(vs, vxlan); if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { - vxlan_sock_hold(vs); - dev_hold(dev); - queue_work(vxlan_wq, &vxlan->igmp_join); + ret = vxlan_igmp_join(vxlan); + if (ret) { + vxlan_sock_release(vs); + return ret; + } } if (vxlan->age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); - return 0; + return ret; } /* Purge the forwarding table */ @@ -2279,19 +2252,21 @@ static int vxlan_stop(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; + int ret = 0; if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && !vxlan_group_used(vn, vxlan)) { - vxlan_sock_hold(vs); - dev_hold(dev); - queue_work(vxlan_wq, &vxlan->igmp_leave); + ret = vxlan_igmp_leave(vxlan); + if (ret) + return ret; } del_timer_sync(&vxlan->age_timer); vxlan_flush(vxlan); + vxlan_sock_release(vs); - return 0; + return ret; } /* Stub, nothing needs to be done. */ @@ -2402,9 +2377,6 @@ static void vxlan_setup(struct net_device *dev) INIT_LIST_HEAD(&vxlan->next); spin_lock_init(&vxlan->hash_lock); - INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join); - INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave); - INIT_WORK(&vxlan->sock_work, vxlan_sock_work); init_timer_deferrable(&vxlan->age_timer); vxlan->age_timer.function = vxlan_cleanup; @@ -2551,6 +2523,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, sock = vxlan_create_sock(net, ipv6, port, flags); if (IS_ERR(sock)) { + pr_info("Cannot bind port %d, err=%ld\n", ntohs(port), + PTR_ERR(sock)); kfree(vs); return ERR_CAST(sock); } @@ -2590,45 +2564,23 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, struct vxlan_sock *vs; bool ipv6 = flags & VXLAN_F_IPV6; - vs = vxlan_socket_create(net, port, rcv, data, flags); - if (!IS_ERR(vs)) - return vs; - - if (no_share) /* Return error if sharing is not allowed. */ - return vs; - - spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags); - if (vs && ((vs->rcv != rcv) || - !atomic_add_unless(&vs->refcnt, 1, 0))) - vs = ERR_PTR(-EBUSY); - spin_unlock(&vn->sock_lock); - - if (!vs) - vs = ERR_PTR(-EINVAL); + if (!no_share) { + spin_lock(&vn->sock_lock); + vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, + flags); + if (vs && vs->rcv == rcv) { + if (!atomic_add_unless(&vs->refcnt, 1, 0)) + vs = ERR_PTR(-EBUSY); + spin_unlock(&vn->sock_lock); + return vs; + } + spin_unlock(&vn->sock_lock); + } - return vs; + return vxlan_socket_create(net, port, rcv, data, flags); } EXPORT_SYMBOL_GPL(vxlan_sock_add); -/* Scheduled at device creation to bind to a socket */ -static void vxlan_sock_work(struct work_struct *work) -{ - struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work); - struct net *net = vxlan->net; - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - __be16 port = vxlan->dst_port; - struct vxlan_sock *nvs; - - nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags); - spin_lock(&vn->sock_lock); - if (!IS_ERR(nvs)) - vxlan_vs_add_dev(nvs, vxlan); - spin_unlock(&vn->sock_lock); - - dev_put(vxlan->dev); -} - static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { diff --git a/drivers/net/wireless/orinoco/airport.c b/drivers/net/wireless/orinoco/airport.c index 0ca8b1455cd9..77e6c53040a3 100644 --- a/drivers/net/wireless/orinoco/airport.c +++ b/drivers/net/wireless/orinoco/airport.c @@ -228,7 +228,7 @@ MODULE_AUTHOR("Benjamin Herrenschmidt <benh@kernel.crashing.org>"); MODULE_DESCRIPTION("Driver for the Apple Airport wireless card."); MODULE_LICENSE("Dual MPL/GPL"); -static struct of_device_id airport_match[] = { +static const struct of_device_id airport_match[] = { { .name = "radio", }, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80f2e0fc3d02..280a315de8d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,6 +103,9 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type); + + u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn); }; struct bpf_prog_type_list { @@ -133,7 +136,7 @@ struct bpf_map *bpf_map_get(struct fd f); void bpf_map_put(struct bpf_map *map); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) { @@ -154,4 +157,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_get_prandom_u32_proto; +extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/igmp.h b/include/linux/igmp.h index b5a6470e686c..2c677afeea47 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -111,9 +111,7 @@ struct ip_mc_list { extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u16 proto); extern int igmp_rcv(struct sk_buff *); -extern int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); -extern int __ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1cc54822b931..4550c67b92e4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -205,6 +205,7 @@ enum { MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20, MLX4_DEV_CAP_FLAG2_PORT_REMAP = 1LL << 21, MLX4_DEV_CAP_FLAG2_QCN = 1LL << 22, + MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT = 1LL << 23 }; enum { @@ -450,6 +451,21 @@ enum mlx4_module_id { MLX4_MODULE_ID_QSFP28 = 0x11, }; +enum { /* rl */ + MLX4_QP_RATE_LIMIT_NONE = 0, + MLX4_QP_RATE_LIMIT_KBS = 1, + MLX4_QP_RATE_LIMIT_MBS = 2, + MLX4_QP_RATE_LIMIT_GBS = 3 +}; + +struct mlx4_rate_limit_caps { + u16 num_rates; /* Number of different rates */ + u8 min_unit; + u16 min_val; + u8 max_unit; + u16 max_val; +}; + static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) { return (major << 32) | (minor << 16) | subminor; @@ -565,6 +581,7 @@ struct mlx4_caps { u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; u32 vf_caps; + struct mlx4_rate_limit_caps rl_caps; }; struct mlx4_buf_list { diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 551f85456c11..1023ebe035b7 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -207,14 +207,16 @@ struct mlx4_qp_context { __be32 msn; __be16 rq_wqe_counter; __be16 sq_wqe_counter; - u32 reserved3[2]; + u32 reserved3; + __be16 rate_limit_params; + __be16 reserved4; __be32 param3; __be32 nummmcpeers_basemkey; u8 log_page_size; - u8 reserved4[2]; + u8 reserved5[2]; u8 mtt_base_addr_h; __be32 mtt_base_addr_l; - u32 reserved5[10]; + u32 reserved6[10]; }; struct mlx4_update_qp_context { @@ -229,6 +231,7 @@ struct mlx4_update_qp_context { enum { MLX4_UPD_QP_MASK_PM_STATE = 32, MLX4_UPD_QP_MASK_VSD = 33, + MLX4_UPD_QP_MASK_RATE_LIMIT = 35, }; enum { @@ -428,7 +431,8 @@ struct mlx4_wqe_inline_seg { enum mlx4_update_qp_attr { MLX4_UPDATE_QP_SMAC = 1 << 0, MLX4_UPDATE_QP_VSD = 1 << 1, - MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 2) - 1 + MLX4_UPDATE_QP_RATE_LIMIT = 1 << 2, + MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 3) - 1 }; enum mlx4_update_qp_params_flags { @@ -438,6 +442,8 @@ enum mlx4_update_qp_params_flags { struct mlx4_update_qp_params { u8 smac_index; u32 flags; + u16 rate_unit; + u16 rate_val; }; int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ddab1a2a07a0..76951c5fbedf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -587,6 +587,7 @@ struct netdev_queue { #ifdef CONFIG_BQL struct dql dql; #endif + unsigned long tx_maxrate; } ____cacheline_aligned_in_smp; static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) @@ -768,8 +769,6 @@ struct netdev_phys_item_id { typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); -struct fib_info; - /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1024,23 +1023,10 @@ struct fib_info; * be otherwise expressed by feature flags. The check is called with * the set of features that the stack has calculated and it returns * those the driver believes to be appropriate. - * - * int (*ndo_switch_parent_id_get)(struct net_device *dev, - * struct netdev_phys_item_id *psid); - * Called to get an ID of the switch chip this port is part of. - * If driver implements this, it indicates that it represents a port - * of a switch chip. - * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state); - * Called to notify switch device port of bridge port STP - * state change. - * int (*ndo_sw_parent_fib_ipv4_add)(struct net_device *dev, __be32 dst, - * int dst_len, struct fib_info *fi, - * u8 tos, u8 type, u32 nlflags, u32 tb_id); - * Called to add/modify IPv4 route to switch device. - * int (*ndo_sw_parent_fib_ipv4_del)(struct net_device *dev, __be32 dst, - * int dst_len, struct fib_info *fi, - * u8 tos, u8 type, u32 tb_id); - * Called to delete IPv4 route from switch device. + * int (*ndo_set_tx_maxrate)(struct net_device *dev, + * int queue_index, u32 maxrate); + * Called when a user wants to set a max-rate limitation of specific + * TX queue. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1178,6 +1164,8 @@ struct net_device_ops { bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); + int (*ndo_get_phys_port_name)(struct net_device *dev, + char *name, size_t len); void (*ndo_add_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); @@ -1197,25 +1185,9 @@ struct net_device_ops { netdev_features_t (*ndo_features_check) (struct sk_buff *skb, struct net_device *dev, netdev_features_t features); -#ifdef CONFIG_NET_SWITCHDEV - int (*ndo_switch_parent_id_get)(struct net_device *dev, - struct netdev_phys_item_id *psid); - int (*ndo_switch_port_stp_update)(struct net_device *dev, - u8 state); - int (*ndo_switch_fib_ipv4_add)(struct net_device *dev, - __be32 dst, - int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 nlflags, - u32 tb_id); - int (*ndo_switch_fib_ipv4_del)(struct net_device *dev, - __be32 dst, - int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 tb_id); -#endif + int (*ndo_set_tx_maxrate)(struct net_device *dev, + int queue_index, + u32 maxrate); }; /** @@ -1577,6 +1549,9 @@ struct net_device { const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; const struct forwarding_accel_ops *fwd_ops; +#ifdef CONFIG_NET_SWITCHDEV + const struct swdev_ops *swdev_ops; +#endif const struct header_ops *header_ops; @@ -2181,6 +2156,7 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev); int dev_close(struct net_device *dev); +int dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); @@ -2974,6 +2950,8 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index c93ff8ac474a..99425f2be708 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -49,18 +49,25 @@ struct rhash_head { /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets + * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash - * @shift: Current size (1 << shift) * @locks_mask: Mask to apply before accessing locks[] * @locks: Array of spinlocks protecting individual buckets + * @walkers: List of active walkers + * @rcu: RCU structure for freeing the table + * @future_tbl: Table under construction during rehashing * @buckets: size * hash buckets */ struct bucket_table { - size_t size; + unsigned int size; + unsigned int rehash; u32 hash_rnd; - u32 shift; unsigned int locks_mask; spinlock_t *locks; + struct list_head walkers; + struct rcu_head rcu; + + struct bucket_table __rcu *future_tbl; struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; @@ -76,8 +83,8 @@ struct rhashtable; * @key_len: Length of key * @key_offset: Offset of key in struct to be hashed * @head_offset: Offset of rhash_head in struct to be hashed - * @max_shift: Maximum number of shifts while expanding - * @min_shift: Minimum number of shifts while shrinking + * @max_size: Maximum size while expanding + * @min_size: Minimum size while shrinking * @nulls_base: Base value to generate nulls marker * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key @@ -88,8 +95,8 @@ struct rhashtable_params { size_t key_len; size_t key_offset; size_t head_offset; - size_t max_shift; - size_t min_shift; + unsigned int max_size; + unsigned int min_size; u32 nulls_base; size_t locks_mul; rht_hashfn_t hashfn; @@ -99,33 +106,29 @@ struct rhashtable_params { /** * struct rhashtable - Hash table handle * @tbl: Bucket table - * @future_tbl: Table under construction during expansion/shrinking * @nelems: Number of elements in table * @p: Configuration parameters * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping - * @walkers: List of active walkers * @being_destroyed: True if table is set up for destruction */ struct rhashtable { struct bucket_table __rcu *tbl; - struct bucket_table __rcu *future_tbl; atomic_t nelems; bool being_destroyed; struct rhashtable_params p; struct work_struct run_work; struct mutex mutex; - struct list_head walkers; }; /** * struct rhashtable_walker - Hash table walker * @list: List entry on list of walkers - * @resize: Resize event occured + * @tbl: The table that we were walking over */ struct rhashtable_walker { struct list_head list; - bool resize; + struct bucket_table *tbl; }; /** diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97dbf16f7d9d..f869ae8afbaf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -111,7 +111,7 @@ struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; - struct sock *listener; /* needed for TFO */ + bool tfo_listener; u32 rcv_isn; u32 snt_isn; u32 snt_synack; /* synack sent time */ diff --git a/include/linux/udp.h b/include/linux/udp.h index 247cfdcc4b08..87c094961bd5 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -34,7 +34,7 @@ static inline struct udphdr *inner_udp_hdr(const struct sk_buff *skb) #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) -static inline int udp_hashfn(struct net *net, unsigned num, unsigned mask) +static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { return (num + net_hash_mix(net)) & mask; } diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 9201afe083fa..7ff588ca6817 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -38,8 +38,6 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash, return jhash_3words(lhash, fhash, ports, initval); } -int __inet6_hash(struct sock *sk, struct inet_timewait_sock *twp); - /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index bcd64756e5fe..73fe0f9525d9 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -221,8 +221,8 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); -static inline int inet_bhashfn(struct net *net, const __u16 lport, - const int bhash_size) +static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, + const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } @@ -231,7 +231,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum); /* These can have wildcards, don't try too hard. */ -static inline int inet_lhashfn(struct net *net, const unsigned short num) +static inline u32 inet_lhashfn(const struct net *net, const unsigned short num) { return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1); } @@ -249,6 +249,7 @@ void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw); +int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw); void inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); @@ -383,13 +384,32 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, iph->daddr, dport, inet_iif(skb)); } +u32 sk_ehashfn(const struct sock *sk); +u32 inet6_ehashfn(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport); + +static inline void sk_daddr_set(struct sock *sk, __be32 addr) +{ + sk->sk_daddr = addr; /* alias of inet_daddr */ +#if IS_ENABLED(CONFIG_IPV6) + ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); +#endif +} + +static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) +{ + sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ +#if IS_ENABLED(CONFIG_IPV6) + ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); +#endif +} + int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, - struct inet_timewait_sock **), - int (*hash)(struct sock *sk, - struct inet_timewait_sock *twp)); + struct inet_timewait_sock **)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index b3053fdd871e..b6c3737da4e9 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -81,7 +81,6 @@ struct inet_request_sock { #define ir_cookie req.__req_common.skc_cookie #define ireq_net req.__req_common.skc_net #define ireq_state req.__req_common.skc_state -#define ireq_refcnt req.__req_common.skc_refcnt #define ireq_family req.__req_common.skc_family kmemcheck_bitfield_begin(flags); @@ -94,11 +93,11 @@ struct inet_request_sock { acked : 1, no_srccheck: 1; kmemcheck_bitfield_end(flags); + u32 ir_mark; union { struct ip_options_rcu *opt; struct sk_buff *pktopts; }; - u32 ir_mark; }; static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) @@ -106,13 +105,12 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) return (struct inet_request_sock *)sk; } -static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb) +static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) { + if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) return skb->mark; - } else { - return sk->sk_mark; - } + + return sk->sk_mark; } struct inet_cork { @@ -245,20 +243,8 @@ static inline unsigned int __inet_ehashfn(const __be32 laddr, initval); } -static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops) -{ - struct request_sock *req = reqsk_alloc(ops); - struct inet_request_sock *ireq = inet_rsk(req); - - if (req != NULL) { - kmemcheck_annotate_bitfield(ireq, flags); - ireq->opt = NULL; - atomic64_set(&ireq->ir_cookie, 0); - ireq->ireq_state = TCP_NEW_SYN_RECV; - } - - return req; -} +struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener); static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b7673065c074..e7ba9758a345 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -942,10 +942,6 @@ void ipv6_sysctl_unregister(void); int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); -int __ipv6_sock_mc_join(struct sock *sk, int ifindex, - const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); -int __ipv6_sock_mc_drop(struct sock *sk, int ifindex, - const struct in6_addr *addr); #endif /* _NET_IPV6_H */ diff --git a/include/net/netns/hash.h b/include/net/netns/hash.h index c06ac58ca107..69a6715d9f3f 100644 --- a/include/net/netns/hash.h +++ b/include/net/netns/hash.h @@ -5,7 +5,7 @@ struct net; -static inline unsigned int net_hash_mix(struct net *net) +static inline u32 net_hash_mix(const struct net *net) { #ifdef CONFIG_NET_NS /* @@ -13,7 +13,7 @@ static inline unsigned int net_hash_mix(struct net *net) * always zeroed */ - return (unsigned)(((unsigned long)net) >> L1_CACHE_SHIFT); + return (u32)(((unsigned long)net) >> L1_CACHE_SHIFT); #else return 0; #endif diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 3275cf31f731..e7ef86340514 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -52,6 +52,7 @@ struct request_sock { #define rsk_refcnt __req_common.skc_refcnt struct request_sock *dl_next; + struct sock *rsk_listener; u16 mss; u8 num_retrans; /* number of retransmits */ u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */ @@ -67,13 +68,21 @@ struct request_sock { u32 peer_secid; }; -static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops) +static inline struct request_sock * +reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) { struct request_sock *req = kmem_cache_alloc(ops->slab, GFP_ATOMIC); - if (req != NULL) + if (req) { req->rsk_ops = ops; - + sock_hold(sk_listener); + req->rsk_listener = sk_listener; + + /* Following is temporary. It is coupled with debugging + * helpers in reqsk_put() & reqsk_free() + */ + atomic_set(&req->rsk_refcnt, 0); + } return req; } @@ -82,19 +91,27 @@ static inline struct request_sock *inet_reqsk(struct sock *sk) return (struct request_sock *)sk; } -static inline void __reqsk_free(struct request_sock *req) +static inline struct sock *req_to_sk(struct request_sock *req) { - kmem_cache_free(req->rsk_ops->slab, req); + return (struct sock *)req; } static inline void reqsk_free(struct request_sock *req) { + /* temporary debugging */ + WARN_ON_ONCE(atomic_read(&req->rsk_refcnt) != 0); + req->rsk_ops->destructor(req); - __reqsk_free(req); + if (req->rsk_listener) + sock_put(req->rsk_listener); + kmem_cache_free(req->rsk_ops->slab, req); } static inline void reqsk_put(struct request_sock *req) { + /* temporary debugging, until req sock are put into ehash table */ + WARN_ON_ONCE(atomic_read(&req->rsk_refcnt) != 1); + if (atomic_dec_and_test(&req->rsk_refcnt)) reqsk_free(req); } @@ -285,6 +302,12 @@ static inline void reqsk_queue_hash_req(struct request_sock_queue *queue, req->sk = NULL; req->dl_next = lopt->syn_table[hash]; + /* before letting lookups find us, make sure all req fields + * are committed to memory and refcnt initialized. + */ + smp_wmb(); + atomic_set(&req->rsk_refcnt, 1); + write_lock(&queue->syn_wait_lock); lopt->syn_table[hash] = req; write_unlock(&queue->syn_wait_lock); diff --git a/include/net/sock.h b/include/net/sock.h index f10832ca2e90..e0360f5a53e9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -67,6 +67,7 @@ #include <linux/atomic.h> #include <net/dst.h> #include <net/checksum.h> +#include <net/tcp_states.h> #include <linux/net_tstamp.h> struct cgroup; @@ -2218,6 +2219,14 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb) return NULL; } +/* This helper checks if a socket is a full socket, + * ie _not_ a timewait or request socket. + */ +static inline bool sk_fullsock(const struct sock *sk) +{ + return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); +} + void sock_enable_timestamp(struct sock *sk, int flag); int sock_get_timestamp(struct sock *, struct timeval __user *); int sock_get_timestampns(struct sock *, struct timespec __user *); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 1a9382febcc3..e5de53f92482 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -14,6 +14,44 @@ #include <linux/netdevice.h> #include <linux/notifier.h> +struct fib_info; + +/** + * struct switchdev_ops - switchdev operations + * + * int (*swdev_parent_id_get)(struct net_device *dev, + * struct netdev_phys_item_id *psid); + * Called to get an ID of the switch chip this port is part of. + * If driver implements this, it indicates that it represents a port + * of a switch chip. + * + * int (*swdev_port_stp_update)(struct net_device *dev, u8 state); + * Called to notify switch device port of bridge port STP + * state change. + * + * int (*swdev_fib_ipv4_add)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 nlflags, u32 tb_id); + * Called to add/modify IPv4 route to switch device. + * + * int (*swdev_fib_ipv4_del)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 tb_id); + * Called to delete IPv4 route from switch device. + */ +struct swdev_ops { + int (*swdev_parent_id_get)(struct net_device *dev, + struct netdev_phys_item_id *psid); + int (*swdev_port_stp_update)(struct net_device *dev, u8 state); + int (*swdev_fib_ipv4_add)(struct net_device *dev, __be32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, + u32 tb_id); + int (*swdev_fib_ipv4_del)(struct net_device *dev, __be32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id); +}; + enum netdev_switch_notifier_type { NETDEV_SWITCH_FDB_ADD = 1, NETDEV_SWITCH_FDB_DEL, diff --git a/include/net/tcp.h b/include/net/tcp.h index 2e11e38205c2..5b29835b81d8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1137,31 +1137,6 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk->sk_rcvbuf); } -static inline void tcp_openreq_init(struct request_sock *req, - struct tcp_options_received *rx_opt, - struct sk_buff *skb, struct sock *sk) -{ - struct inet_request_sock *ireq = inet_rsk(req); - - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - req->cookie_ts = 0; - tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_rsk(req)->last_oow_ack_time = 0; - req->mss = rx_opt->mss_clamp; - req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; - ireq->tstamp_ok = rx_opt->tstamp_ok; - ireq->sack_ok = rx_opt->sack_ok; - ireq->snd_wscale = rx_opt->snd_wscale; - ireq->wscale_ok = rx_opt->wscale_ok; - ireq->acked = 0; - ireq->ecn_ok = 0; - ireq->ir_rmt_port = tcp_hdr(skb)->source; - ireq->ir_num = ntohs(tcp_hdr(skb)->dest); - ireq->ir_mark = inet_request_mark(sk, skb); -} - extern void tcp_openreq_init_rwin(struct request_sock *req, struct sock *sk, struct dst_entry *dst); @@ -1241,36 +1216,8 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, return true; } -/* Return true if we're currently rate-limiting out-of-window ACKs and - * thus shouldn't send a dupack right now. We rate-limit dupacks in - * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS - * attacks that send repeated SYNs or ACKs for the same connection. To - * do this, we do not send a duplicate SYNACK or ACK if the remote - * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. - */ -static inline bool tcp_oow_rate_limited(struct net *net, - const struct sk_buff *skb, - int mib_idx, u32 *last_oow_ack_time) -{ - /* Data packets without SYNs are not likely part of an ACK loop. */ - if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && - !tcp_hdr(skb)->syn) - goto not_rate_limited; - - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; - -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! */ -} +bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, + int mib_idx, u32 *last_oow_ack_time); static inline void tcp_mib_init(struct net *net) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3fa1af8a58d7..1623047af463 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,7 +165,22 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ + BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ __BPF_FUNC_MAX_ID, }; +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 756436e1ce89..f5f5edd5ae5f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -147,6 +147,7 @@ enum { IFLA_CARRIER_CHANGES, IFLA_PHYS_SWITCH_ID, IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, __IFLA_MAX }; @@ -224,6 +225,9 @@ enum { IFLA_BR_FORWARD_DELAY, IFLA_BR_HELLO_TIME, IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, __IFLA_BR_MAX, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 50603aec766a..4139a0f8b558 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -661,6 +661,9 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a3c7701a8b5e..bd7f5988ed9c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,8 @@ */ #include <linux/bpf.h> #include <linux/rcupdate.h> +#include <linux/random.h> +#include <linux/smp.h> /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -87,3 +89,25 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 669719ccc9ee..ea75c654af1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -519,7 +519,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(prog, attr); + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6b522496250..c22ebd36fa4b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1620,11 +1620,10 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_LDX uses reserved fields\n"); - return -EINVAL; - } + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1643,6 +1642,29 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + + if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1790,6 +1812,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1881,6 +1910,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + static void free_states(struct verifier_env *env) { struct verifier_state_list *sl, *sln; @@ -1903,13 +2018,13 @@ static void free_states(struct verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; - if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, @@ -1919,7 +2034,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; - env->prog = prog; + env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -1951,7 +2066,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - env->explored_states = kcalloc(prog->len, + env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -1968,6 +2083,10 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -1983,18 +2102,18 @@ skip_full_check: if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); - if (!prog->aux->used_maps) { + if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } - memcpy(prog->aux->used_maps, env->used_maps, + memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); - prog->aux->used_map_cnt = env->used_map_cnt; + env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions @@ -2006,11 +2125,12 @@ free_log_buf: if (log_level) vfree(log_buf); free_env: - if (!prog->aux->used_maps) + if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); + *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index fc0d451279f0..5f8fe3e88219 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -27,17 +27,12 @@ #include <linux/err.h> #define HASH_DEFAULT_SIZE 64UL -#define HASH_MIN_SIZE 4UL +#define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 128UL /* Base bits plus 1 bit for nulls marker */ #define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) -enum { - RHT_LOCK_NORMAL, - RHT_LOCK_NESTED, -}; - /* The bucket lock is selected based on the hash and protects mutations * on a group of hash buckets. * @@ -146,8 +141,13 @@ static void bucket_table_free(const struct bucket_table *tbl) kvfree(tbl); } +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets, u32 hash_rnd) + size_t nbuckets) { struct bucket_table *tbl = NULL; size_t size; @@ -162,14 +162,16 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return NULL; tbl->size = nbuckets; - tbl->shift = ilog2(nbuckets); - tbl->hash_rnd = hash_rnd; if (alloc_bucket_locks(ht, tbl) < 0) { bucket_table_free(tbl); return NULL; } + INIT_LIST_HEAD(&tbl->walkers); + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); @@ -186,7 +188,7 @@ static bool rht_grow_above_75(const struct rhashtable *ht, { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && - (!ht->p.max_shift || tbl->shift < ht->p.max_shift); + (!ht->p.max_size || tbl->size < ht->p.max_size); } /** @@ -199,13 +201,14 @@ static bool rht_shrink_below_30(const struct rhashtable *ht, { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && - tbl->shift > ht->p.min_shift; + tbl->size > ht->p.min_size; } static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) { - struct bucket_table *new_tbl = rht_dereference(ht->future_tbl, ht); struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl = + rht_dereference(old_tbl->future_tbl, ht) ?: old_tbl; struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; int err = -ENOENT; struct rhash_head *head, *next, *entry; @@ -229,7 +232,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) new_bucket_lock = bucket_lock(new_tbl, new_hash); - spin_lock_nested(new_bucket_lock, RHT_LOCK_NESTED); + spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); head = rht_dereference_bucket(new_tbl->buckets[new_hash], new_tbl, new_hash); @@ -257,6 +260,7 @@ static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned old_hash) spin_lock_bh(old_bucket_lock); while (!rhashtable_rehash_one(ht, old_hash)) ; + old_tbl->rehash++; spin_unlock_bh(old_bucket_lock); } @@ -264,16 +268,13 @@ static void rhashtable_rehash(struct rhashtable *ht, struct bucket_table *new_tbl) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct rhashtable_walker *walker; unsigned old_hash; - get_random_bytes(&new_tbl->hash_rnd, sizeof(new_tbl->hash_rnd)); - /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. - * The synchronize_rcu() guarantees for the new table to be picked up - * so no new additions go into the old table while we relink. */ - rcu_assign_pointer(ht->future_tbl, new_tbl); + rcu_assign_pointer(old_tbl->future_tbl, new_tbl); /* Ensure the new table is visible to readers. */ smp_wmb(); @@ -284,13 +285,14 @@ static void rhashtable_rehash(struct rhashtable *ht, /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); + list_for_each_entry(walker, &old_tbl->walkers, list) + walker->tbl = NULL; + /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. */ - synchronize_rcu(); - - bucket_table_free(old_tbl); + call_rcu(&old_tbl->rcu, bucket_table_free_rcu); } /** @@ -314,7 +316,7 @@ int rhashtable_expand(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, old_tbl->hash_rnd); + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); if (new_tbl == NULL) return -ENOMEM; @@ -345,7 +347,7 @@ int rhashtable_shrink(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size / 2, old_tbl->hash_rnd); + new_tbl = bucket_table_alloc(ht, old_tbl->size / 2); if (new_tbl == NULL) return -ENOMEM; @@ -358,7 +360,6 @@ static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; - struct rhashtable_walker *walker; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); @@ -367,9 +368,6 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rht_dereference(ht->tbl, ht); - list_for_each_entry(walker, &ht->walkers, list) - walker->resize = true; - if (rht_grow_above_75(ht, tbl)) rhashtable_expand(ht); else if (rht_shrink_below_30(ht, tbl)) @@ -385,14 +383,16 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, struct rhash_head *head; bool no_resize_running; unsigned hash; + spinlock_t *old_lock; bool success = true; rcu_read_lock(); old_tbl = rht_dereference_rcu(ht->tbl, ht); hash = head_hashfn(ht, old_tbl, obj); + old_lock = bucket_lock(old_tbl, hash); - spin_lock_bh(bucket_lock(old_tbl, hash)); + spin_lock_bh(old_lock); /* Because we have already taken the bucket lock in old_tbl, * if we find that future_tbl is not yet visible then that @@ -400,10 +400,10 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, * also grab the bucket lock in old_tbl because until the * rehash completes ht->tbl won't be changed. */ - tbl = rht_dereference_rcu(ht->future_tbl, ht); + tbl = rht_dereference_rcu(old_tbl->future_tbl, ht) ?: old_tbl; if (tbl != old_tbl) { hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(bucket_lock(tbl, hash), RHT_LOCK_NESTED); + spin_lock_nested(bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); } if (compare && @@ -429,13 +429,10 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, schedule_work(&ht->run_work); exit: - if (tbl != old_tbl) { - hash = head_hashfn(ht, tbl, obj); + if (tbl != old_tbl) spin_unlock(bucket_lock(tbl, hash)); - } - hash = head_hashfn(ht, old_tbl, obj); - spin_unlock_bh(bucket_lock(old_tbl, hash)); + spin_unlock_bh(old_lock); rcu_read_unlock(); @@ -512,28 +509,25 @@ static bool __rhashtable_remove(struct rhashtable *ht, */ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl, *old_tbl; + struct bucket_table *tbl; bool ret; rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - ret = __rhashtable_remove(ht, old_tbl, obj); + tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in - * old_tbl if it exists. + * the old tbl if it exists. */ - tbl = rht_dereference_rcu(ht->future_tbl, ht); - if (!ret && old_tbl != tbl) - ret = __rhashtable_remove(ht, tbl, obj); + while (!(ret = __rhashtable_remove(ht, tbl, obj)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; if (ret) { - bool no_resize_running = tbl == old_tbl; - atomic_dec(&ht->nelems); - if (no_resize_running && rht_shrink_below_30(ht, tbl)) + if (rht_shrink_below_30(ht, tbl)) schedule_work(&ht->run_work); } @@ -599,7 +593,7 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg) { - const struct bucket_table *tbl, *old_tbl; + const struct bucket_table *tbl; struct rhash_head *he; u32 hash; @@ -618,9 +612,8 @@ restart: /* Ensure we see any new tables. */ smp_rmb(); - old_tbl = tbl; - tbl = rht_dereference_rcu(ht->future_tbl, ht); - if (unlikely(tbl != old_tbl)) + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) goto restart; rcu_read_unlock(); @@ -725,11 +718,9 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) if (!iter->walker) return -ENOMEM; - INIT_LIST_HEAD(&iter->walker->list); - iter->walker->resize = false; - mutex_lock(&ht->mutex); - list_add(&iter->walker->list, &ht->walkers); + iter->walker->tbl = rht_dereference(ht->tbl, ht); + list_add(&iter->walker->list, &iter->walker->tbl->walkers); mutex_unlock(&ht->mutex); return 0; @@ -745,7 +736,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { mutex_lock(&iter->ht->mutex); - list_del(&iter->walker->list); + if (iter->walker->tbl) + list_del(&iter->walker->list); mutex_unlock(&iter->ht->mutex); kfree(iter->walker); } @@ -766,13 +758,21 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * by calling rhashtable_walk_next. */ int rhashtable_walk_start(struct rhashtable_iter *iter) + __acquires(RCU) { + struct rhashtable *ht = iter->ht; + + mutex_lock(&ht->mutex); + + if (iter->walker->tbl) + list_del(&iter->walker->list); + rcu_read_lock(); - if (iter->walker->resize) { - iter->slot = 0; - iter->skip = 0; - iter->walker->resize = false; + mutex_unlock(&ht->mutex); + + if (!iter->walker->tbl) { + iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -794,13 +794,11 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - const struct bucket_table *tbl; + struct bucket_table *tbl = iter->walker->tbl; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; void *obj = NULL; - tbl = rht_dereference_rcu(ht->tbl, ht); - if (p) { p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); goto next; @@ -826,17 +824,17 @@ next: iter->skip = 0; } - iter->p = NULL; - -out: - if (iter->walker->resize) { - iter->p = NULL; + iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker->tbl) { iter->slot = 0; iter->skip = 0; - iter->walker->resize = false; return ERR_PTR(-EAGAIN); } + iter->p = NULL; + +out: + return obj; } EXPORT_SYMBOL_GPL(rhashtable_walk_next); @@ -848,16 +846,34 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); * Finish a hash table walk. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) { - rcu_read_unlock(); + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker->tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + mutex_lock(&ht->mutex); + if (tbl->rehash < tbl->size) + list_add(&iter->walker->list, &tbl->walkers); + else + iter->walker->tbl = NULL; + mutex_unlock(&ht->mutex); + iter->p = NULL; + +out: + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - 1UL << params->min_shift); + (unsigned long)params->min_size); } /** @@ -907,7 +923,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; - u32 hash_rnd; size = HASH_DEFAULT_SIZE; @@ -918,8 +933,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) return -EINVAL; - params->min_shift = max_t(size_t, params->min_shift, - ilog2(HASH_MIN_SIZE)); + params->min_size = max(params->min_size, HASH_MIN_SIZE); if (params->nelem_hint) size = rounded_hashtable_size(params); @@ -927,23 +941,19 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); memcpy(&ht->p, params, sizeof(*params)); - INIT_LIST_HEAD(&ht->walkers); if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); else ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; - get_random_bytes(&hash_rnd, sizeof(hash_rnd)); - - tbl = bucket_table_alloc(ht, size, hash_rnd); + tbl = bucket_table_alloc(ht, size); if (tbl == NULL) return -ENOMEM; atomic_set(&ht->nelems, 0); RCU_INIT_POINTER(ht->tbl, tbl); - RCU_INIT_POINTER(ht->future_tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 67c7593d1dd6..2bc403d8f9ee 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -80,7 +80,7 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) rcu_cnt = cnt = 0; if (!quiet) - pr_info(" [%#4x/%zu]", i, tbl->size); + pr_info(" [%#4x/%u]", i, tbl->size); rht_for_each_entry_rcu(obj, pos, tbl, i, node) { cnt++; @@ -201,7 +201,7 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, - .max_shift = 1, /* we expand/shrink manually here */ + .max_size = 2, /* we expand/shrink manually here */ .nulls_base = (3U << RHT_BASE_SHIFT), }; int err; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 64c6bed4a3d3..98a30a5b8664 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -413,7 +413,10 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan_transfer_features(dev, vlandev); break; - case NETDEV_DOWN: + case NETDEV_DOWN: { + struct net_device *tmp; + LIST_HEAD(close_list); + if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); @@ -425,11 +428,18 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) - dev_change_flags(vlandev, flgs & ~IFF_UP); + list_add(&vlandev->close_list, &close_list); + } + + dev_close_many(&close_list, false); + + list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { netif_stacked_transfer_operstate(dev, vlandev); + list_del_init(&vlandev->close_list); } + list_del(&close_list); break; - + } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8bc6b67457dc..e1115a224a95 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -733,6 +733,9 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, + [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, + [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, + [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -762,6 +765,24 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } + if (data[IFLA_BR_AGEING_TIME]) { + u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]); + + br->ageing_time = clock_t_to_jiffies(ageing_time); + } + + if (data[IFLA_BR_STP_STATE]) { + u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); + + br_stp_set_enabled(br, stp_enabled); + } + + if (data[IFLA_BR_PRIORITY]) { + u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]); + + br_stp_set_bridge_priority(br, priority); + } + return 0; } @@ -770,6 +791,9 @@ static size_t br_get_size(const struct net_device *brdev) return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ 0; } @@ -779,10 +803,16 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); + u32 ageing_time = jiffies_to_clock_t(br->ageing_time); + u32 stp_enabled = br->stp_enabled; + u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || - nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || + nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || + nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || + nla_put_u16(skb, IFLA_BR_PRIORITY, priority)) return -EMSGSIZE; return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 39fe369b46ad..5d43e010ef87 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1385,7 +1385,7 @@ static int __dev_close(struct net_device *dev) return retval; } -static int dev_close_many(struct list_head *head) +int dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1399,11 +1399,13 @@ static int dev_close_many(struct list_head *head) list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); - list_del_init(&dev->close_list); + if (unlink) + list_del_init(&dev->close_list); } return 0; } +EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. @@ -1420,7 +1422,7 @@ int dev_close(struct net_device *dev) LIST_HEAD(single); list_add(&dev->close_list, &single); - dev_close_many(&single); + dev_close_many(&single, true); list_del(&single); } return 0; @@ -5912,6 +5914,24 @@ int dev_get_phys_port_id(struct net_device *dev, EXPORT_SYMBOL(dev_get_phys_port_id); /** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_name) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_name(dev, name, len); +} +EXPORT_SYMBOL(dev_get_phys_port_name); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * @@ -5968,7 +5988,7 @@ static void rollback_registered_many(struct list_head *head) /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); - dev_close_many(&close_head); + dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ diff --git a/net/core/filter.c b/net/core/filter.c index 7a4eb7030dba..b95ae7fe7e4f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,30 +150,82 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return prandom_u32(); } +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + + case SKF_AD_PROTOCOL: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + + /* dst_reg = *(u16 *) (src_reg + offsetof(protocol)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, protocol)); + /* dst_reg = ntohs(dst_reg) [emitting a nop or swap16] */ + *insn++ = BPF_ENDIAN(BPF_FROM_BE, dst_reg, 16); + break; + + case SKF_AD_VLAN_TAG: + case SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_tci)); + if (skb_field == SKF_AD_VLAN_TAG) { + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, + ~VLAN_TAG_PRESENT); + } else { + /* dst_reg >>= 12 */ + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); + /* dst_reg &= 1 */ + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); + } + break; + } + + return insn - insn_buf; +} + static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; + u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - - /* A = *(u16 *) (CTX + offsetof(protocol)) */ - *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, protocol)); - /* A = ntohs(A) [emitting a nop or swap16] */ - *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + cnt = convert_skb_access(SKF_AD_PROTOCOL, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - PKT_TYPE_OFFSET()); - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); -#ifdef __BIG_ENDIAN_BITFIELD - insn++; - *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); -#endif + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: @@ -197,10 +249,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - - *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, mark)); + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: @@ -211,29 +261,20 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, queue_mapping)); + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: - case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + cnt = convert_skb_access(SKF_AD_VLAN_TAG, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; - /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ - *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, vlan_tci)); - if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, - ~VLAN_TAG_PRESENT); - } else { - /* A >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); - /* A &= 1 */ - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); - } + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: @@ -1139,6 +1180,10 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return NULL; } @@ -1147,13 +1192,66 @@ sk_filter_func_proto(enum bpf_func_id func_id) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { - /* skb fields cannot be accessed yet */ - return false; + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* check bounds */ + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + /* all __sk_buff fields are __u32 */ + if (size != 4) + return false; + + return true; +} + +static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, len): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, len)); + break; + + case offsetof(struct __sk_buff, mark): + return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, pkt_type): + return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, queue_mapping): + return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, protocol): + return convert_skb_access(SKF_AD_PROTOCOL, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_present): + return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_tci): + return convert_skb_access(SKF_AD_VLAN_TAG, + dst_reg, src_reg, insn); + } + + return insn - insn_buf; } static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index cf30620a88e1..cc5cf689809c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -418,6 +418,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + char name[IFNAMSIZ]; + + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sprintf(buf, "%s\n", name); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_name); + static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -465,6 +487,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, NULL, }; @@ -951,6 +974,60 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, return sprintf(buf, "%lu", trans_timeout); } +#ifdef CONFIG_XPS +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + +static ssize_t show_tx_maxrate(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + return sprintf(buf, "%lu\n", queue->tx_maxrate); +} + +static ssize_t set_tx_maxrate(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + int err, index = get_netdev_queue_index(queue); + u32 rate = 0; + + err = kstrtou32(buf, 10, &rate); + if (err < 0) + return err; + + if (!rtnl_trylock()) + return restart_syscall(); + + err = -EOPNOTSUPP; + if (dev->netdev_ops->ndo_set_tx_maxrate) + err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + + rtnl_unlock(); + if (!err) { + queue->tx_maxrate = rate; + return len; + } + return err; +} + +static struct netdev_queue_attribute queue_tx_maxrate = + __ATTR(tx_maxrate, S_IRUGO | S_IWUSR, + show_tx_maxrate, set_tx_maxrate); +#endif + static struct netdev_queue_attribute queue_trans_timeout = __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); @@ -1065,18 +1142,6 @@ static struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static unsigned int get_netdev_queue_index(struct netdev_queue *queue) -{ - struct net_device *dev = queue->dev; - unsigned int i; - - i = queue - dev->_tx; - BUG_ON(i >= dev->num_tx_queues); - - return i; -} - - static ssize_t show_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) { @@ -1153,6 +1218,7 @@ static struct attribute *netdev_queue_default_attrs[] = { &queue_trans_timeout.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &queue_tx_maxrate.attr, #endif NULL }; diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 04db318e6218..cc39a2aa663a 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -103,7 +103,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) while ((req = lopt->syn_table[i]) != NULL) { lopt->syn_table[i] = req->dl_next; lopt->qlen--; - reqsk_free(req); + reqsk_put(req); } } } @@ -153,24 +153,22 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. * - * When a TFO req is created, it needs to sock_hold its listener to prevent - * the latter data structure from going away. - * - * This function also sets "treq->listener" to NULL and unreference listener - * socket. treq->listener is used by the listener so it is protected by the + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { - struct sock *lsk = tcp_rsk(req)->listener; - struct fastopen_queue *fastopenq = - inet_csk(lsk)->icsk_accept_queue.fastopenq; + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; @@ -179,8 +177,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); - reqsk_free(req); + reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. @@ -201,5 +198,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 25b4b5d23485..6abe634c666c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -982,6 +982,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) +{ + char name[IFNAMSIZ]; + int err; + + err = dev_get_phys_port_name(dev, name, sizeof(name)); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; @@ -1072,6 +1090,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_port_name_fill(skb, dev)) + goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; diff --git a/net/core/sock.c b/net/core/sock.c index 4bc42efb3e40..d9f9e4825362 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1454,8 +1454,8 @@ void sk_release_kernel(struct sock *sk) return; sock_hold(sk); - sock_release(sk->sk_socket); sock_net_set(sk, get_net(&init_net)); + sock_release(sk->sk_socket); sock_put(sk); } EXPORT_SYMBOL(sk_release_kernel); @@ -1661,21 +1661,6 @@ void sock_efree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_efree); -#ifdef CONFIG_INET -void sock_edemux(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put(inet_twsk(sk)); - else if (sk->sk_state == TCP_NEW_SYN_RECV) - reqsk_put(inet_reqsk(sk)); - else - sock_put(sk); -} -EXPORT_SYMBOL(sock_edemux); -#endif - kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 7f6456afbaec..e7ad291cd96b 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -89,10 +89,9 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (inet->inet_saddr == 0) inet->inet_saddr = fl4->saddr; - inet->inet_rcv_saddr = inet->inet_saddr; - + sk_rcv_saddr_set(sk, inet->inet_saddr); inet->inet_dport = usin->sin_port; - inet->inet_daddr = daddr; + sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) @@ -408,8 +407,8 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->ir_rmt_addr; - newinet->inet_rcv_saddr = ireq->ir_loc_addr; + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; newinet->inet_opt = ireq->opt; ireq->opt = NULL; @@ -624,7 +623,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp_request_sock_ops); + req = inet_reqsk_alloc(&dccp_request_sock_ops, sk); if (req == NULL) goto drop; @@ -639,9 +638,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; ireq = inet_rsk(req); - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; - write_pnet(&ireq->ireq_net, sock_net(sk)); + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ireq->ireq_family = AF_INET; ireq->ir_iif = sk->sk_bound_dev_if; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5166b0043f95..c655de5f67c9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -40,19 +40,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; -static void dccp_v6_hash(struct sock *sk) -{ - if (sk->sk_state != DCCP_CLOSED) { - if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { - inet_hash(sk); - return; - } - local_bh_disable(); - __inet6_hash(sk, NULL); - local_bh_enable(); - } -} - /* add pseudo-header to DCCP checksum stored in skb->csum */ static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, const struct in6_addr *saddr, @@ -386,7 +373,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp6_request_sock_ops); + req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk); if (req == NULL) goto drop; @@ -403,7 +390,6 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - write_pnet(&ireq->ireq_net, sock_net(sk)); ireq->ireq_family = AF_INET6; if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || @@ -471,11 +457,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr); - - ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - - newsk->sk_v6_rcv_saddr = newnp->saddr; + newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; newsk->sk_backlog_rcv = dccp_v4_do_rcv; @@ -593,7 +575,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, dccp_done(newsk); goto out; } - __inet6_hash(newsk, NULL); + __inet_hash(newsk, NULL); return newsk; @@ -918,9 +900,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_backlog_rcv = dccp_v6_do_rcv; goto failure; } - ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); - ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr); - + np->saddr = sk->sk_v6_rcv_saddr; return err; } @@ -1063,7 +1043,7 @@ static struct proto dccp_v6_prot = { .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, - .hash = dccp_v6_hash, + .hash = inet_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index b45206e8dd3e..9379a9cf7f5d 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -6,9 +6,8 @@ config HAVE_NET_DSA config NET_DSA tristate - depends on HAVE_NET_DSA + depends on HAVE_NET_DSA && NET_SWITCHDEV select PHYLIB - select NET_SWITCHDEV if NET_DSA diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6511552039d6..f0af7aa331c1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -16,6 +16,7 @@ #include <linux/of_net.h> #include <linux/of_mdio.h> #include <net/rtnetlink.h> +#include <net/switchdev.h> #include <linux/if_bridge.h> #include "dsa_priv.h" @@ -572,8 +573,11 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_do_ioctl = dsa_slave_ioctl, - .ndo_switch_parent_id_get = dsa_slave_parent_id_get, - .ndo_switch_port_stp_update = dsa_slave_stp_update, +}; + +static const struct swdev_ops dsa_slave_swdev_ops = { + .swdev_parent_id_get = dsa_slave_parent_id_get, + .swdev_port_stp_update = dsa_slave_stp_update, }; static void dsa_slave_adjust_link(struct net_device *dev) @@ -755,6 +759,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; slave_dev->netdev_ops = &dsa_slave_netdev_ops; + slave_dev->swdev_ops = &dsa_slave_swdev_ops; SET_NETDEV_DEV(slave_dev, parent); slave_dev->dev.of_node = ds->pd->port_dn[port]; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5105759e4e00..975ee5e30c64 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -107,7 +107,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static u32 inet_addr_hash(struct net *net, __be32 addr) +static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = (__force u32) addr ^ net_hash_mix(net); @@ -560,9 +560,9 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) lock_sock(sk); if (join) - ret = __ip_mc_join_group(sk, &mreq); + ret = ip_mc_join_group(sk, &mreq); else - ret = __ip_mc_leave_group(sk, &mreq); + ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5cb1ef4ce292..ad3f866085de 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1850,7 +1850,10 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) pmc->sfcount[MCAST_EXCLUDE] = 1; } -int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +/* Join a multicast group + */ + +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; @@ -1898,20 +1901,6 @@ int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) done: return err; } -EXPORT_SYMBOL(__ip_mc_join_group); - -/* Join a multicast group - */ -int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) -{ - int ret; - - rtnl_lock(); - ret = __ip_mc_join_group(sk, imr); - rtnl_unlock(); - - return ret; -} EXPORT_SYMBOL(ip_mc_join_group); static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, @@ -1934,7 +1923,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, return err; } -int __ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; @@ -1979,18 +1968,6 @@ int __ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) out: return ret; } -EXPORT_SYMBOL(__ip_mc_leave_group); - -int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) -{ - int ret; - - rtnl_lock(); - ret = __ip_mc_leave_group(sk, imr); - rtnl_unlock(); - - return ret; -} EXPORT_SYMBOL(ip_mc_leave_group); int ip_mc_source(int add, int omode, struct sock *sk, struct @@ -2010,7 +1987,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); + ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; @@ -2124,9 +2101,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: - rtnl_unlock(); if (leavegroup) - return ip_mc_leave_group(sk, &imr); + err = ip_mc_leave_group(sk, &imr); return err; } @@ -2148,7 +2124,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; - rtnl_lock(); + ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; @@ -2210,7 +2186,6 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) pmc->sfmode = msf->imsf_fmode; err = 0; done: - rtnl_unlock(); if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 34581f928afa..f0f91858aecf 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -293,8 +293,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct sock *newsk; struct request_sock *req; + struct sock *newsk; int error; lock_sock(sk); @@ -323,9 +323,11 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) newsk = req->sk; sk_acceptq_removed(sk); - if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { + if (sk->sk_protocol == IPPROTO_TCP && + tcp_rsk(req)->tfo_listener && + queue->fastopenq) { spin_lock_bh(&queue->fastopenq->lock); - if (tcp_rsk(req)->listener) { + if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken @@ -340,7 +342,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) out: release_sock(sk); if (req) - __reqsk_free(req); + reqsk_put(req); return newsk; out_err: newsk = NULL; @@ -635,7 +637,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, /* Drop this request */ inet_csk_reqsk_queue_unlink(parent, req, reqp); reqsk_queue_removed(queue, req); - reqsk_free(req); + reqsk_put(req); continue; } reqp = &req->dl_next; @@ -817,9 +819,9 @@ void inet_csk_listen_stop(struct sock *sk) percpu_counter_inc(sk->sk_prot->orphan_count); - if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != tcp_rsk(req)->listener); + BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is @@ -828,7 +830,6 @@ void inet_csk_listen_stop(struct sock *sk) * tcp_v4_destroy_sock(). */ tcp_sk(child)->fastopen_rsk = NULL; - sock_put(sk); } inet_csk_destroy_sock(child); @@ -837,7 +838,7 @@ void inet_csk_listen_stop(struct sock *sk) sock_put(child); sk_acceptq_removed(sk); - __reqsk_free(req); + reqsk_put(req); } if (queue->fastopenq != NULL) { /* Free all the reqs queued in rskq_rst_head. */ @@ -847,7 +848,7 @@ void inet_csk_listen_stop(struct sock *sk) spin_unlock_bh(&queue->fastopenq->lock); while ((req = acc_req) != NULL) { acc_req = req->dl_next; - __reqsk_free(req); + reqsk_put(req); } } WARN_ON(sk->sk_ack_backlog); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ac7b5c909fe7..74c39c9f3e11 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -113,14 +113,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, return -EMSGSIZE; r = nlmsg_data(nlh); - BUG_ON((1 << sk->sk_state) & (TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV)); + BUG_ON(!sk_fullsock(sk)); inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; r->idiag_timer = 0; r->idiag_retrans = 0; - if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -229,7 +228,6 @@ static int inet_csk_diag_fill(struct sock *sk, static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, - const struct inet_diag_req_v2 *req, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { @@ -265,6 +263,39 @@ static int inet_twsk_diag_fill(struct sock *sk, return 0; } +static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = inet_reqsk(sk)->num_retrans; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + + tmo = inet_reqsk(sk)->expires - jiffies; + r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + nlmsg_end(skb, nlh); + return 0; +} + static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, @@ -272,9 +303,13 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct nlmsghdr *unlh) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, r, portid, seq, + return inet_twsk_diag_fill(sk, skb, portid, seq, nlmsg_flags, unlh); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return inet_req_diag_fill(sk, skb, portid, seq, + nlmsg_flags, unlh); + return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); } @@ -502,7 +537,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); - entry.userlocks = (sk->sk_state != TCP_TIME_WAIT) ? sk->sk_userlocks : 0; + entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); } @@ -661,61 +696,6 @@ static void twsk_build_assert(void) #endif } -static int inet_twsk_diag_dump(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - twsk_build_assert(); - - if (!inet_diag_bc_sk(bc, sk)) - return 0; - - return inet_twsk_diag_fill(sk, skb, r, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); -} - -static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, - const struct nlmsghdr *unlh) -{ - const struct inet_request_sock *ireq = inet_rsk(req); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - NLM_F_MULTI); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - inet_diag_msg_common_fill(r, (struct sock *)ireq); - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = req->num_retrans; - - BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != - offsetof(struct sock, sk_cookie)); - - tmo = req->expires - jiffies; - if (tmo < 0) - tmo = 0; - - r->idiag_expires = jiffies_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); - r->idiag_inode = 0; - - nlmsg_end(skb, nlh); - return 0; -} - static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, @@ -762,17 +742,17 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (bc) { /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, (struct sock *)req); + entry_fill_addrs(&entry, req_to_sk(req)); entry.dport = ntohs(ireq->ir_rmt_port); if (!inet_diag_bc_run(bc, &entry)) continue; } - err = inet_diag_fill_req(skb, sk, req, - sk_user_ns(NETLINK_CB(cb->skb).sk), + err = inet_req_diag_fill(req_to_sk(req), skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh); + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->nlh); if (err < 0) { cb->args[3] = j + 1; cb->args[4] = reqnum; @@ -903,10 +883,16 @@ skip_listen_ht: if (r->id.idiag_dport != sk->sk_dport && r->id.idiag_dport) goto next_normal; - if (sk->sk_state == TCP_TIME_WAIT) - res = inet_twsk_diag_dump(sk, skb, cb, r, bc); - else - res = inet_csk_diag_dump(sk, skb, cb, r, bc); + twsk_build_assert(); + + if (!inet_diag_bc_sk(bc, sk)) + goto next_normal; + + res = sk_diag_fill(sk, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 64401a2fdd33..0fb841b9d834 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,9 +24,9 @@ #include <net/secure_seq.h> #include <net/ip.h> -static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +static u32 inet_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 inet_ehash_secret __read_mostly; @@ -36,17 +36,21 @@ static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, inet_ehash_secret + net_hash_mix(net)); } - -static unsigned int inet_sk_ehashfn(const struct sock *sk) +/* This function handles inet_sock, but also timewait and request sockets + * for IPv4/IPv6. + */ +u32 sk_ehashfn(const struct sock *sk) { - const struct inet_sock *inet = inet_sk(sk); - const __be32 laddr = inet->inet_rcv_saddr; - const __u16 lport = inet->inet_num; - const __be32 faddr = inet->inet_daddr; - const __be16 fport = inet->inet_dport; - struct net *net = sock_net(sk); - - return inet_ehashfn(net, laddr, lport, faddr, fport); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + return inet6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); +#endif + return inet_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); } /* @@ -269,6 +273,12 @@ void sock_gen_put(struct sock *sk) } EXPORT_SYMBOL_GPL(sock_gen_put); +void sock_edemux(struct sk_buff *skb) +{ + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); + struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, @@ -401,13 +411,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; - spinlock_t *lock; struct inet_ehash_bucket *head; + spinlock_t *lock; int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); - sk->sk_hash = inet_sk_ehashfn(sk); + sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); @@ -424,15 +434,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); -static void __inet_hash(struct sock *sk) +int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; - if (sk->sk_state != TCP_LISTEN) { - __inet_hash_nolisten(sk, NULL); - return; - } + if (sk->sk_state != TCP_LISTEN) + return __inet_hash_nolisten(sk, tw); WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -441,13 +449,15 @@ static void __inet_hash(struct sock *sk) __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); spin_unlock(&ilb->lock); + return 0; } +EXPORT_SYMBOL(__inet_hash); void inet_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk); + __inet_hash(sk, NULL); local_bh_enable(); } } @@ -478,8 +488,7 @@ EXPORT_SYMBOL_GPL(inet_unhash); int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **), - int (*hash)(struct sock *sk, struct inet_timewait_sock *twp)) + struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; const unsigned short snum = inet_sk(sk)->inet_num; @@ -549,7 +558,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - twrefcnt += hash(sk, tw); + twrefcnt += __inet_hash_nolisten(sk, tw); } if (tw) twrefcnt += inet_twsk_bind_unhash(tw, hinfo); @@ -571,7 +580,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - hash(sk, NULL); + __inet_hash_nolisten(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { @@ -591,7 +600,7 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), - __inet_check_established, __inet_hash_nolisten); + __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 86ebf020925b..f38e387448fb 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -487,6 +487,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; restart_rcu: + cond_resched(); rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5cd99271d3a6..f6a0d54b308a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -536,12 +536,34 @@ out: * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. */ +static bool setsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_MSFILTER: + case IP_UNBLOCK_SOURCE: + case MCAST_BLOCK_SOURCE: + case MCAST_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_UNBLOCK_SOURCE: + return true; + } + return false; +} static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); int val = 0, err; + bool needs_rtnl = setsockopt_needs_rtnl(optname); switch (optname) { case IP_PKTINFO: @@ -584,6 +606,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, return ip_mroute_setsockopt(sk, optname, optval, optlen); err = 0; + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -1118,10 +1142,14 @@ mc_msf_out: break; } release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return err; e_inval: release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return -EINVAL; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index fd88f868776f..344e7cdfb8d4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -64,11 +64,11 @@ EXPORT_SYMBOL_GPL(pingv6_ops); static u16 ping_port_rover; -static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) +static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { - int res = (num + net_hash_mix(net)) & mask; + u32 res = (num + net_hash_mix(net)) & mask; - pr_debug("hash(%d) = %d\n", num, res); + pr_debug("hash(%u) = %u\n", num, res); return res; } EXPORT_SYMBOL_GPL(ping_hash); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index f17db898ed26..ef01d8570358 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -219,19 +219,20 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_check); -static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) + if (child) { + atomic_set(&req->rsk_refcnt, 1); inet_csk_reqsk_queue_add(sk, req, child); - else + } else { reqsk_free(req); - + } return child; } @@ -325,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ if (!req) goto out; @@ -336,8 +337,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ireq->ir_mark = inet_request_mark(sk, skb); ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -345,8 +346,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; - treq->listener = NULL; - write_pnet(&ireq->ireq_net, sock_net(sk)); + treq->tfo_listener = false; ireq->ireq_family = AF_INET; ireq->ir_iif = sk->sk_bound_dev_if; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index fe77417fc137..82e375a0cbcf 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -155,12 +155,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp = tcp_sk(child); tp->fastopen_rsk = req; - /* Do a hold on the listner sk so that if the listener is being - * closed, the child that has been accepted can live on and still - * access listen_lock. - */ - sock_hold(sk); - tcp_rsk(req)->listener = sk; + tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. @@ -174,6 +169,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); + atomic_set(&req->rsk_refcnt, 1); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); @@ -253,7 +249,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk) fastopenq->rskq_rst_head = req1->dl_next; fastopenq->qlen--; spin_unlock(&fastopenq->lock); - reqsk_free(req1); + reqsk_put(req1); } return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 717d437b6ce1..1dfbaee3554e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3321,6 +3321,36 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +/* Return true if we're currently rate-limiting out-of-window ACKs and + * thus shouldn't send a dupack right now. We rate-limit dupacks in + * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS + * attacks that send repeated SYNs or ACKs for the same connection. To + * do this, we do not send a duplicate SYNACK or ACK if the remote + * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. + */ +bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, + int mib_idx, u32 *last_oow_ack_time) +{ + /* Data packets without SYNs are not likely part of an ACK loop. */ + if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && + !tcp_hdr(skb)->syn) + goto not_rate_limited; + + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + +not_rate_limited: + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) { @@ -5912,6 +5942,51 @@ static void tcp_ecn_create_request(struct request_sock *req, inet_rsk(req)->ecn_ok = 1; } +static void tcp_openreq_init(struct request_sock *req, + const struct tcp_options_received *rx_opt, + struct sk_buff *skb, const struct sock *sk) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->cookie_ts = 0; + tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->last_oow_ack_time = 0; + req->mss = rx_opt->mss_clamp; + req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; + ireq->tstamp_ok = rx_opt->tstamp_ok; + ireq->sack_ok = rx_opt->sack_ok; + ireq->snd_wscale = rx_opt->snd_wscale; + ireq->wscale_ok = rx_opt->wscale_ok; + ireq->acked = 0; + ireq->ecn_ok = 0; + ireq->ir_rmt_port = tcp_hdr(skb)->source; + ireq->ir_num = ntohs(tcp_hdr(skb)->dest); + ireq->ir_mark = inet_request_mark(sk, skb); +} + +struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener) +{ + struct request_sock *req = reqsk_alloc(ops, sk_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + kmemcheck_annotate_bitfield(ireq, flags); + ireq->opt = NULL; + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + + } + + return req; +} +EXPORT_SYMBOL(inet_reqsk_alloc); + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -5949,7 +6024,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops); + req = inet_reqsk_alloc(rsk_ops, sk); if (!req) goto drop; @@ -5965,7 +6040,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); - write_pnet(&inet_rsk(req)->ireq_net, sock_net(sk)); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; @@ -6042,7 +6116,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (err || want_cookie) goto drop_and_free; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1f514a0c5e60..ddd0b1f25b96 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -189,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; - inet->inet_rcv_saddr = inet->inet_saddr; + sk_rcv_saddr_set(sk, inet->inet_saddr); if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ @@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; - inet->inet_daddr = daddr; + sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) @@ -1219,14 +1219,14 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) #endif -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, +static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); + ireq->no_srccheck = inet_sk(sk_listener)->transparent; ireq->opt = tcp_v4_save_options(skb); ireq->ireq_family = AF_INET; } @@ -1319,8 +1319,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->ir_rmt_addr; - newinet->inet_rcv_saddr = ireq->ir_loc_addr; + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); @@ -1518,7 +1518,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) if (sk) { skb->sk = sk; skb->destructor = sock_edemux; - if (sk->sk_state != TCP_TIME_WAIT) { + if (sk_fullsock(sk)) { struct dst_entry *dst = sk->sk_rx_dst; if (dst) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 366728cbee4a..5bef3513af77 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -152,6 +152,9 @@ static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst #define TCP_METRICS_RECLAIM_DEPTH 5 #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL +#define deref_locked(p) \ + rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock)) + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct inetpeer_addr *saddr, struct inetpeer_addr *daddr, @@ -180,9 +183,9 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; - oldest = rcu_dereference(tcp_metrics_hash[hash].chain); - for (tm = rcu_dereference(oldest->tcpm_next); tm; - tm = rcu_dereference(tm->tcpm_next)) { + oldest = deref_locked(tcp_metrics_hash[hash].chain); + for (tm = deref_locked(oldest->tcpm_next); tm; + tm = deref_locked(tm->tcpm_next)) { if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) oldest = tm; } @@ -1040,12 +1043,6 @@ out_free: return ret; } -#define deref_locked_genl(p) \ - rcu_dereference_protected(p, lockdep_genl_is_held() && \ - lockdep_is_held(&tcp_metrics_lock)) - -#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) - static void tcp_metrics_flush_all(struct net *net) { unsigned int max_rows = 1U << tcp_metrics_hash_log; @@ -1057,8 +1054,7 @@ static void tcp_metrics_flush_all(struct net *net) struct tcp_metrics_block __rcu **pp; spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; - for (tm = deref_locked_genl(*pp); tm; - tm = deref_locked_genl(*pp)) { + for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { if (net_eq(tm_net(tm), net)) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); @@ -1097,7 +1093,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) hb = tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); - for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { + for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f27556e2158b..294af16633af 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -318,8 +318,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); } -static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, - unsigned int port) +static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, + unsigned int port) { return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } @@ -421,9 +421,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, return score; } -static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +static u32 udp_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 udp_ehash_secret __read_mostly; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 88d2cf0cae52..158378e73f0a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2473,9 +2473,9 @@ static int ipv6_mc_config(struct sock *sk, bool join, lock_sock(sk); if (join) - ret = __ipv6_sock_mc_join(sk, ifindex, addr); + ret = ipv6_sock_mc_join(sk, ifindex, addr); else - ret = __ipv6_sock_mc_drop(sk, ifindex, addr); + ret = ipv6_sock_mc_drop(sk, ifindex, addr); release_sock(sk); return ret; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 051dffb49c90..033f17816ef4 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -23,11 +23,9 @@ #include <net/secure_seq.h> #include <net/ip.h> -static unsigned int inet6_ehashfn(struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +u32 inet6_ehashfn(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport) { static u32 inet6_ehash_secret __read_mostly; static u32 ipv6_hash_secret __read_mostly; @@ -44,54 +42,6 @@ static unsigned int inet6_ehashfn(struct net *net, inet6_ehash_secret + net_hash_mix(net)); } -static int inet6_sk_ehashfn(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct in6_addr *laddr = &sk->sk_v6_rcv_saddr; - const struct in6_addr *faddr = &sk->sk_v6_daddr; - const __u16 lport = inet->inet_num; - const __be16 fport = inet->inet_dport; - struct net *net = sock_net(sk); - - return inet6_ehashfn(net, laddr, lport, faddr, fport); -} - -int __inet6_hash(struct sock *sk, struct inet_timewait_sock *tw) -{ - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - int twrefcnt = 0; - - WARN_ON(!sk_unhashed(sk)); - - if (sk->sk_state == TCP_LISTEN) { - struct inet_listen_hashbucket *ilb; - - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - spin_lock(&ilb->lock); - __sk_nulls_add_node_rcu(sk, &ilb->head); - spin_unlock(&ilb->lock); - } else { - unsigned int hash; - struct hlist_nulls_head *list; - spinlock_t *lock; - - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - list = &inet_ehash_bucket(hashinfo, hash)->chain; - lock = inet_ehash_lockp(hashinfo, hash); - spin_lock(lock); - __sk_nulls_add_node_rcu(sk, list); - if (tw) { - WARN_ON(sk->sk_hash != tw->tw_hash); - twrefcnt = inet_twsk_unhash(tw); - } - spin_unlock(lock); - } - - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - return twrefcnt; -} -EXPORT_SYMBOL(__inet6_hash); - /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM @@ -320,6 +270,6 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), - __inet6_check_established, __inet6_hash); + __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 8d766d9100cb..cc5883791bac 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -117,6 +117,23 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, return opt; } +static bool setsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IPV6_ADD_MEMBERSHIP: + case IPV6_DROP_MEMBERSHIP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_MSFILTER: + return true; + } + return false; +} + static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -124,6 +141,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; + bool needs_rtnl = setsockopt_needs_rtnl(optname); if (optval == NULL) val = 0; @@ -140,6 +158,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -624,10 +644,10 @@ done: psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) retv = ipv6_sock_mc_join(sk, greq.gr_interface, - &psin6->sin6_addr); + &psin6->sin6_addr); else retv = ipv6_sock_mc_drop(sk, greq.gr_interface, - &psin6->sin6_addr); + &psin6->sin6_addr); break; } case MCAST_JOIN_SOURCE_GROUP: @@ -660,7 +680,7 @@ done: psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, - &psin6->sin6_addr); + &psin6->sin6_addr); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) break; @@ -837,11 +857,15 @@ pref_skip_coa: } release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return retv; e_inval: release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return -EINVAL; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 1dd1fedff9f4..cbb66fd3da6d 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -132,7 +132,7 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ? iv : 1; } -int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; @@ -199,24 +199,12 @@ int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *add return 0; } -EXPORT_SYMBOL(__ipv6_sock_mc_join); - -int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) -{ - int ret; - - rtnl_lock(); - ret = __ipv6_sock_mc_join(sk, ifindex, addr); - rtnl_unlock(); - - return ret; -} EXPORT_SYMBOL(ipv6_sock_mc_join); /* * socket leave on multicast group */ -int __ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; @@ -255,18 +243,6 @@ int __ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *add return -EADDRNOTAVAIL; } -EXPORT_SYMBOL(__ipv6_sock_mc_drop); - -int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) -{ - int ret; - - rtnl_lock(); - ret = __ipv6_sock_mc_drop(sk, ifindex, addr); - rtnl_unlock(); - - return ret; -} EXPORT_SYMBOL(ipv6_sock_mc_drop); /* called with rcu_read_lock() */ @@ -460,7 +436,7 @@ done: read_unlock_bh(&idev->lock); rcu_read_unlock(); if (leavegroup) - return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); + err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 58875ce8e178..da5823e5e5a7 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -49,11 +49,12 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct sock *child; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) + if (child) { + atomic_set(&req->rsk_refcnt, 1); inet_csk_reqsk_queue_add(sk, req, child); - else + } else { reqsk_free(req); - + } return child; } @@ -189,14 +190,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops); + req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk); if (!req) goto out; ireq = inet_rsk(req); treq = tcp_rsk(req); - treq->listener = NULL; - write_pnet(&ireq->ireq_net, sock_net(sk)); + treq->tfo_listener = false; ireq->ireq_family = AF_INET6; if (security_inet_conn_request(sk, skb, req)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d89f028dc8c4..720676d073d9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -104,19 +104,6 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) } } -static void tcp_v6_hash(struct sock *sk) -{ - if (sk->sk_state != TCP_CLOSE) { - if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { - tcp_prot.hash(sk); - return; - } - local_bh_disable(); - __inet6_hash(sk, NULL); - local_bh_enable(); - } -} - static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) { return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, @@ -233,11 +220,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; - } else { - ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); - ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, - &sk->sk_v6_rcv_saddr); } + np->saddr = sk->sk_v6_rcv_saddr; return err; } @@ -1078,11 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr); - - ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - - newsk->sk_v6_rcv_saddr = newnp->saddr; + newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -1231,7 +1211,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_done(newsk); goto out; } - __inet6_hash(newsk, NULL); + __inet_hash(newsk, NULL); return newsk; @@ -1583,7 +1563,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) if (sk) { skb->sk = sk; skb->destructor = sock_edemux; - if (sk->sk_state != TCP_TIME_WAIT) { + if (sk_fullsock(sk)) { struct dst_entry *dst = sk->sk_rx_dst; if (dst) @@ -1890,7 +1870,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = tcp_v6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 70568a4548e4..7fe0329c0d37 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -53,11 +53,11 @@ #include <trace/events/skb.h> #include "udp_impl.h" -static unsigned int udp6_ehashfn(struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +static u32 udp6_ehashfn(const struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { static u32 udp6_ehash_secret __read_mostly; static u32 udp_ipv6_hash_secret __read_mostly; @@ -104,9 +104,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) return 0; } -static unsigned int udp6_portaddr_hash(struct net *net, - const struct in6_addr *addr6, - unsigned int port) +static u32 udp6_portaddr_hash(const struct net *net, + const struct in6_addr *addr6, + unsigned int port) { unsigned int hash, mix = net_hash_mix(net); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 08d95559b6f7..19b9cce6c210 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1405,9 +1405,11 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) mreq.imr_ifindex = dev->ifindex; + rtnl_lock(); lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); + rtnl_unlock(); return ret; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3aedbda7658a..f35c15b0de6b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -209,7 +209,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) struct sock *sk = skb->sk; struct rtable *ort = skb_rtable(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + if (!skb->dev && sk && sk_fullsock(sk)) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index a2233e77cf39..2631876ac55b 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) { - if (!sk || sk->sk_state == TCP_TIME_WAIT) + if (!sk || !sk_fullsock(sk)) return; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 11d85b3813f2..61d04bf9be2b 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -539,7 +539,7 @@ __build_packet_message(struct nfnl_log_net *log, /* UID */ sk = skb->sk; - if (sk && sk->sk_state != TCP_TIME_WAIT) { + if (sk && sk_fullsock(sk)) { read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { struct file *file = sk->sk_socket->file; diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 0db8515e76da..86ee8b05adae 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -257,7 +257,7 @@ static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { const struct cred *cred; - if (sk->sk_state == TCP_TIME_WAIT) + if (!sk_fullsock(sk)) return 0; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e99911eda915..abe68119a76c 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -83,7 +83,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, *(u16 *)dest->data = out->type; break; case NFT_META_SKUID: - if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + if (skb->sk == NULL || !sk_fullsock(skb->sk)) goto err; read_lock_bh(&skb->sk->sk_callback_lock); @@ -99,7 +99,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, read_unlock_bh(&skb->sk->sk_callback_lock); break; case NFT_META_SKGID: - if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + if (skb->sk == NULL || !sk_fullsock(skb->sk)) goto err; read_lock_bh(&skb->sk->sk_callback_lock); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index ef8a926752a9..165b77ce9aa9 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -42,15 +42,21 @@ enum nf_tproxy_lookup_t { static bool tproxy_sk_is_transparent(struct sock *sk) { - if (sk->sk_state != TCP_TIME_WAIT) { - if (inet_sk(sk)->transparent) - return true; - sock_put(sk); - } else { + switch (sk->sk_state) { + case TCP_TIME_WAIT: if (inet_twsk(sk)->tw_transparent) return true; - inet_twsk_put(inet_twsk(sk)); + break; + case TCP_NEW_SYN_RECV: + if (inet_rsk(inet_reqsk(sk))->no_srccheck) + return true; + break; + default: + if (inet_sk(sk)->transparent) + return true; } + + sock_gen_put(sk); return false; } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 13332dbf291d..895534e87a47 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -129,6 +129,20 @@ xt_socket_get_sock_v4(struct net *net, const u8 protocol, return NULL; } +static bool xt_socket_sk_is_transparent(struct sock *sk) +{ + switch (sk->sk_state) { + case TCP_TIME_WAIT: + return inet_twsk(sk)->tw_transparent; + + case TCP_NEW_SYN_RECV: + return inet_rsk(inet_reqsk(sk))->no_srccheck; + + default: + return inet_sk(sk)->transparent; + } +} + static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) @@ -195,16 +209,14 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, * unless XT_SOCKET_NOWILDCARD is set */ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && - sk->sk_state != TCP_TIME_WAIT && + sk_fullsock(sk) && inet_sk(sk)->inet_rcv_saddr == 0); /* Ignore non-transparent sockets, - if XT_SOCKET_TRANSPARENT is used */ + * if XT_SOCKET_TRANSPARENT is used + */ if (info->flags & XT_SOCKET_TRANSPARENT) - transparent = ((sk->sk_state != TCP_TIME_WAIT && - inet_sk(sk)->transparent) || - (sk->sk_state == TCP_TIME_WAIT && - inet_twsk(sk)->tw_transparent)); + transparent = xt_socket_sk_is_transparent(sk); if (sk != skb->sk) sock_gen_put(sk); @@ -363,16 +375,14 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) * unless XT_SOCKET_NOWILDCARD is set */ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && - sk->sk_state != TCP_TIME_WAIT && + sk_fullsock(sk) && ipv6_addr_any(&sk->sk_v6_rcv_saddr)); /* Ignore non-transparent sockets, - if XT_SOCKET_TRANSPARENT is used */ + * if XT_SOCKET_TRANSPARENT is used + */ if (info->flags & XT_SOCKET_TRANSPARENT) - transparent = ((sk->sk_state != TCP_TIME_WAIT && - inet_sk(sk)->transparent) || - (sk->sk_state == TCP_TIME_WAIT && - inet_twsk(sk)->tw_transparent)); + transparent = xt_socket_sk_is_transparent(sk); if (sk != skb->sk) sock_gen_put(sk); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6b0f21950e09..d97aed628bda 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3123,7 +3123,7 @@ static int __init netlink_proto_init(void) .key_offset = offsetof(struct netlink_sock, portid), .key_len = sizeof(u32), /* portid */ .hashfn = jhash, - .max_shift = 16, /* 64K */ + .max_size = 65536, }; if (err != 0) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index b7a23132c610..c9bfa004abed 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -28,11 +28,11 @@ int netdev_switch_parent_id_get(struct net_device *dev, struct netdev_phys_item_id *psid) { - const struct net_device_ops *ops = dev->netdev_ops; + const struct swdev_ops *ops = dev->swdev_ops; - if (!ops->ndo_switch_parent_id_get) + if (!ops || !ops->swdev_parent_id_get) return -EOPNOTSUPP; - return ops->ndo_switch_parent_id_get(dev, psid); + return ops->swdev_parent_id_get(dev, psid); } EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get); @@ -46,12 +46,12 @@ EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get); */ int netdev_switch_port_stp_update(struct net_device *dev, u8 state) { - const struct net_device_ops *ops = dev->netdev_ops; + const struct swdev_ops *ops = dev->swdev_ops; - if (!ops->ndo_switch_port_stp_update) + if (!ops || !ops->swdev_port_stp_update) return -EOPNOTSUPP; - WARN_ON(!ops->ndo_switch_parent_id_get); - return ops->ndo_switch_port_stp_update(dev, state); + WARN_ON(!ops->swdev_parent_id_get); + return ops->swdev_port_stp_update(dev, state); } EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update); @@ -230,17 +230,17 @@ EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink); static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; + const struct swdev_ops *ops = dev->swdev_ops; struct net_device *lower_dev; struct net_device *port_dev; struct list_head *iter; /* Recusively search down until we find a sw port dev. - * (A sw port dev supports ndo_switch_parent_id_get). + * (A sw port dev supports swdev_parent_id_get). */ if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD && - ops->ndo_switch_parent_id_get) + ops && ops->swdev_parent_id_get) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { @@ -304,7 +304,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 nlflags, u32 tb_id) { struct net_device *dev; - const struct net_device_ops *ops; + const struct swdev_ops *ops; int err = 0; /* Don't offload route if using custom ip rules or if @@ -322,12 +322,12 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, dev = netdev_switch_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->netdev_ops; + ops = dev->swdev_ops; - if (ops->ndo_switch_fib_ipv4_add) { - err = ops->ndo_switch_fib_ipv4_add(dev, htonl(dst), dst_len, - fi, tos, type, nlflags, - tb_id); + if (ops->swdev_fib_ipv4_add) { + err = ops->swdev_fib_ipv4_add(dev, htonl(dst), dst_len, + fi, tos, type, nlflags, + tb_id); if (!err) fi->fib_flags |= RTNH_F_EXTERNAL; } @@ -352,7 +352,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 tb_id) { struct net_device *dev; - const struct net_device_ops *ops; + const struct swdev_ops *ops; int err = 0; if (!(fi->fib_flags & RTNH_F_EXTERNAL)) @@ -361,11 +361,11 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, dev = netdev_switch_get_dev_by_nhs(fi); if (!dev) return 0; - ops = dev->netdev_ops; + ops = dev->swdev_ops; - if (ops->ndo_switch_fib_ipv4_del) { - err = ops->ndo_switch_fib_ipv4_del(dev, htonl(dst), dst_len, - fi, tos, type, tb_id); + if (ops->swdev_fib_ipv4_del) { + err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len, + fi, tos, type, tb_id); if (!err) fi->fib_flags &= ~RTNH_F_EXTERNAL; } diff --git a/net/tipc/link.c b/net/tipc/link.c index bc49120bfb44..8c98c4d00ad6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -845,8 +845,10 @@ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, if (link) return rc; - if (likely(in_own_node(net, dnode))) - return tipc_sk_rcv(net, list); + if (likely(in_own_node(net, dnode))) { + tipc_sk_rcv(net, list); + return 0; + } __skb_queue_purge(list); return rc; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 105ba7adf06f..ab0ac62a1287 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -811,8 +811,8 @@ static void tipc_purge_publications(struct net *net, struct name_seq *seq) sseq = seq->sseqs; info = sseq->info; list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) { - tipc_nametbl_remove_publ(net, publ->type, publ->lower, - publ->node, publ->ref, publ->key); + tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node, + publ->ref, publ->key); kfree_rcu(publ, rcu); } hlist_del_init_rcu(&seq->ns_list); diff --git a/net/tipc/server.c b/net/tipc/server.c index eadd4ed45905..ab6183cdb121 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -37,11 +37,13 @@ #include "core.h" #include "socket.h" #include <net/sock.h> +#include <linux/module.h> /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 #define MAX_RECV_MSG_COUNT 25 #define CF_CONNECTED 1 +#define CF_SERVER 2 #define sock2con(x) ((struct tipc_conn *)(x)->sk_user_data) @@ -88,9 +90,19 @@ static void tipc_clean_outqueues(struct tipc_conn *con); static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); + struct sockaddr_tipc *saddr = con->server->saddr; + struct socket *sock = con->sock; + struct sock *sk; - if (con->sock) { - tipc_sock_release_local(con->sock); + if (sock) { + sk = sock->sk; + if (test_bit(CF_SERVER, &con->flags)) { + __module_get(sock->ops->owner); + __module_get(sk->sk_prot_creator->owner); + } + saddr->scope = -TIPC_NODE_SCOPE; + kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); + sk_release_kernel(sk); con->sock = NULL; } @@ -281,7 +293,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con) struct tipc_conn *newcon; int ret; - ret = tipc_sock_accept_local(sock, &newsock, O_NONBLOCK); + ret = kernel_accept(sock, &newsock, O_NONBLOCK); if (ret < 0) return ret; @@ -309,9 +321,12 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) struct socket *sock = NULL; int ret; - ret = tipc_sock_create_local(s->net, s->type, &sock); + ret = sock_create_kern(AF_TIPC, SOCK_SEQPACKET, 0, &sock); if (ret < 0) return NULL; + + sk_change_net(sock->sk, s->net); + ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, (char *)&s->imp, sizeof(s->imp)); if (ret < 0) @@ -337,11 +352,31 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) pr_err("Unknown socket type %d\n", s->type); goto create_err; } + + /* As server's listening socket owner and creator is the same module, + * we have to decrease TIPC module reference count to guarantee that + * it remains zero after the server socket is created, otherwise, + * executing "rmmod" command is unable to make TIPC module deleted + * after TIPC module is inserted successfully. + * + * However, the reference count is ever increased twice in + * sock_create_kern(): one is to increase the reference count of owner + * of TIPC socket's proto_ops struct; another is to increment the + * reference count of owner of TIPC proto struct. Therefore, we must + * decrement the module reference count twice to ensure that it keeps + * zero after server's listening socket is created. Of course, we + * must bump the module reference count twice as well before the socket + * is closed. + */ + module_put(sock->ops->owner); + module_put(sock->sk->sk_prot_creator->owner); + set_bit(CF_SERVER, &con->flags); + return sock; create_err: - sock_release(sock); - con->sock = NULL; + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); return NULL; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 934947f038b6..c03a3d33806f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -74,6 +74,7 @@ * @link_cong: non-zero if owner must sleep because of link congestion * @sent_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer + * @remote: 'connected' peer for dgram/rdm * @node: hash table node * @rcu: rcu struct for tipc_sock */ @@ -96,6 +97,7 @@ struct tipc_sock { bool link_cong; uint sent_unacked; uint rcv_unacked; + struct sockaddr_tipc remote; struct rhash_head node; struct rcu_head rcu; }; @@ -121,9 +123,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; static const struct proto_ops msg_ops; - static struct proto tipc_proto; -static struct proto tipc_proto_kern; static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, @@ -341,11 +341,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, } /* Allocate socket's protocol area */ - if (!kern) - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); - else - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto_kern); - + sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); if (sk == NULL) return -ENOMEM; @@ -383,75 +379,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, return 0; } -/** - * tipc_sock_create_local - create TIPC socket from inside TIPC module - * @type: socket type - SOCK_RDM or SOCK_SEQPACKET - * - * We cannot use sock_creat_kern here because it bumps module user count. - * Since socket owner and creator is the same module we must make sure - * that module count remains zero for module local sockets, otherwise - * we cannot do rmmod. - * - * Returns 0 on success, errno otherwise - */ -int tipc_sock_create_local(struct net *net, int type, struct socket **res) -{ - int rc; - - rc = sock_create_lite(AF_TIPC, type, 0, res); - if (rc < 0) { - pr_err("Failed to create kernel socket\n"); - return rc; - } - tipc_sk_create(net, *res, 0, 1); - - return 0; -} - -/** - * tipc_sock_release_local - release socket created by tipc_sock_create_local - * @sock: the socket to be released. - * - * Module reference count is not incremented when such sockets are created, - * so we must keep it from being decremented when they are released. - */ -void tipc_sock_release_local(struct socket *sock) -{ - tipc_release(sock); - sock->ops = NULL; - sock_release(sock); -} - -/** - * tipc_sock_accept_local - accept a connection on a socket created - * with tipc_sock_create_local. Use this function to avoid that - * module reference count is inadvertently incremented. - * - * @sock: the accepting socket - * @newsock: reference to the new socket to be created - * @flags: socket flags - */ - -int tipc_sock_accept_local(struct socket *sock, struct socket **newsock, - int flags) -{ - struct sock *sk = sock->sk; - int ret; - - ret = sock_create_lite(sk->sk_family, sk->sk_type, - sk->sk_protocol, newsock); - if (ret < 0) - return ret; - - ret = tipc_accept(sock, *newsock, flags); - if (ret < 0) { - sock_release(*newsock); - return ret; - } - (*newsock)->ops = sock->ops; - return ret; -} - static void tipc_sk_callback(struct rcu_head *head) { struct tipc_sock *tsk = container_of(head, struct tipc_sock, rcu); @@ -929,22 +856,23 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) u32 dnode, dport; struct sk_buff_head *pktchain = &sk->sk_write_queue; struct sk_buff *skb; - struct tipc_name_seq *seq = &dest->addr.nameseq; + struct tipc_name_seq *seq; struct iov_iter save; u32 mtu; long timeo; int rc; - if (unlikely(!dest)) - return -EDESTADDRREQ; - - if (unlikely((m->msg_namelen < sizeof(*dest)) || - (dest->family != AF_TIPC))) - return -EINVAL; - if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; - + if (unlikely(!dest)) { + if (tsk->connected && sock->state == SS_READY) + dest = &tsk->remote; + else + return -EDESTADDRREQ; + } else if (unlikely(m->msg_namelen < sizeof(*dest)) || + dest->family != AF_TIPC) { + return -EINVAL; + } if (unlikely(sock->state != SS_READY)) { if (sock->state == SS_LISTENING) return -EPIPE; @@ -957,7 +885,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) tsk->conn_instance = dest->addr.name.name.instance; } } - + seq = &dest->addr.nameseq; timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); if (dest->addrtype == TIPC_ADDR_MCAST) { @@ -1908,17 +1836,24 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int destlen, int flags) { struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest; struct msghdr m = {NULL,}; - long timeout = (flags & O_NONBLOCK) ? 0 : tipc_sk(sk)->conn_timeout; + long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout; socket_state previous; - int res; + int res = 0; lock_sock(sk); - /* For now, TIPC does not allow use of connect() with DGRAM/RDM types */ + /* DGRAM/RDM connect(), just save the destaddr */ if (sock->state == SS_READY) { - res = -EOPNOTSUPP; + if (dst->family == AF_UNSPEC) { + memset(&tsk->remote, 0, sizeof(struct sockaddr_tipc)); + tsk->connected = 0; + } else { + memcpy(&tsk->remote, dest, destlen); + tsk->connected = 1; + } goto exit; } @@ -2153,7 +2088,6 @@ restart: TIPC_CONN_SHUTDOWN)) tipc_link_xmit_skb(net, skb, dnode, tsk->portid); - tipc_node_remove_conn(net, dnode, tsk->portid); } else { dnode = tsk_peer_node(tsk); @@ -2361,8 +2295,8 @@ int tipc_sk_rht_init(struct net *net) .key_offset = offsetof(struct tipc_sock, portid), .key_len = sizeof(u32), /* portid */ .hashfn = jhash, - .max_shift = 20, /* 1M */ - .min_shift = 8, /* 256 */ + .max_size = 1048576, + .min_size = 256, }; return rhashtable_init(&tn->sk_rht, &rht_params); @@ -2608,12 +2542,6 @@ static struct proto tipc_proto = { .sysctl_rmem = sysctl_tipc_rmem }; -static struct proto tipc_proto_kern = { - .name = "TIPC", - .obj_size = sizeof(struct tipc_sock), - .sysctl_rmem = sysctl_tipc_rmem -}; - /** * tipc_socket_init - initialize TIPC socket interface * diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 238f1b7bd9bd..bf6551389522 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -44,10 +44,6 @@ SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) int tipc_socket_init(void); void tipc_socket_stop(void); -int tipc_sock_create_local(struct net *net, int type, struct socket **res); -void tipc_sock_release_local(struct socket *sock); -int tipc_sock_accept_local(struct socket *sock, struct socket **newsock, - int flags); int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index fc2fb11a354d..04836dd70c2b 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -246,11 +246,11 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) return 0; mreqn.imr_multiaddr = remote->ipv4; mreqn.imr_ifindex = ub->ifindex; - err = __ip_mc_join_group(sk, &mreqn); + err = ip_mc_join_group(sk, &mreqn); } else { if (!ipv6_addr_is_multicast(&remote->ipv6)) return 0; - err = __ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); + err = ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); } return err; } diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c index 066892662915..ed18e9a4909c 100644 --- a/samples/bpf/sockex1_kern.c +++ b/samples/bpf/sockex1_kern.c @@ -1,5 +1,6 @@ #include <uapi/linux/bpf.h> #include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include "bpf_helpers.h" @@ -11,14 +12,17 @@ struct bpf_map_def SEC("maps") my_map = { }; SEC("socket1") -int bpf_prog1(struct sk_buff *skb) +int bpf_prog1(struct __sk_buff *skb) { int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); long *value; + if (skb->pkt_type != PACKET_OUTGOING) + return 0; + value = bpf_map_lookup_elem(&my_map, &index); if (value) - __sync_fetch_and_add(value, 1); + __sync_fetch_and_add(value, skb->len); return 0; } diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index 34a443ff3831..678ce4693551 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -40,7 +40,7 @@ int main(int ac, char **argv) key = IPPROTO_ICMP; assert(bpf_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0); - printf("TCP %lld UDP %lld ICMP %lld packets\n", + printf("TCP %lld UDP %lld ICMP %lld bytes\n", tcp_cnt, udp_cnt, icmp_cnt); sleep(1); } diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index 6f0135f0f217..ba0e177ff561 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -42,13 +42,13 @@ static inline int proto_ports_offset(__u64 proto) } } -static inline int ip_is_fragment(struct sk_buff *ctx, __u64 nhoff) +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) { return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) & (IP_MF | IP_OFFSET); } -static inline __u32 ipv6_addr_hash(struct sk_buff *ctx, __u64 off) +static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) { __u64 w0 = load_word(ctx, off); __u64 w1 = load_word(ctx, off + 4); @@ -58,7 +58,7 @@ static inline __u32 ipv6_addr_hash(struct sk_buff *ctx, __u64 off) return (__u32)(w0 ^ w1 ^ w2 ^ w3); } -static inline __u64 parse_ip(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto, +static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, struct flow_keys *flow) { __u64 verlen; @@ -82,7 +82,7 @@ static inline __u64 parse_ip(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto, return nhoff; } -static inline __u64 parse_ipv6(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto, +static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, struct flow_keys *flow) { *ip_proto = load_byte(skb, @@ -96,7 +96,7 @@ static inline __u64 parse_ipv6(struct sk_buff *skb, __u64 nhoff, __u64 *ip_proto return nhoff; } -static inline bool flow_dissector(struct sk_buff *skb, struct flow_keys *flow) +static inline bool flow_dissector(struct __sk_buff *skb, struct flow_keys *flow) { __u64 nhoff = ETH_HLEN; __u64 ip_proto; @@ -183,18 +183,23 @@ static inline bool flow_dissector(struct sk_buff *skb, struct flow_keys *flow) return true; } +struct pair { + long packets; + long bytes; +}; + struct bpf_map_def SEC("maps") hash_map = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(__be32), - .value_size = sizeof(long), + .value_size = sizeof(struct pair), .max_entries = 1024, }; SEC("socket2") -int bpf_prog2(struct sk_buff *skb) +int bpf_prog2(struct __sk_buff *skb) { struct flow_keys flow; - long *value; + struct pair *value; u32 key; if (!flow_dissector(skb, &flow)) @@ -203,9 +208,10 @@ int bpf_prog2(struct sk_buff *skb) key = flow.dst; value = bpf_map_lookup_elem(&hash_map, &key); if (value) { - __sync_fetch_and_add(value, 1); + __sync_fetch_and_add(&value->packets, 1); + __sync_fetch_and_add(&value->bytes, skb->len); } else { - long val = 1; + struct pair val = {1, skb->len}; bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); } diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index d2d5f5a790d3..29a276d766fc 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -6,6 +6,11 @@ #include <unistd.h> #include <arpa/inet.h> +struct pair { + __u64 packets; + __u64 bytes; +}; + int main(int ac, char **argv) { char filename[256]; @@ -29,13 +34,13 @@ int main(int ac, char **argv) for (i = 0; i < 5; i++) { int key = 0, next_key; - long long value; + struct pair value; while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) { bpf_lookup_elem(map_fd[0], &next_key, &value); - printf("ip %s count %lld\n", + printf("ip %s bytes %lld packets %lld\n", inet_ntoa((struct in_addr){htonl(next_key)}), - value); + value.bytes, value.packets); key = next_key; } sleep(1); diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index 7b56b59fad8e..75d561f9fd6a 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -14,6 +14,7 @@ #include <linux/unistd.h> #include <string.h> #include <linux/filter.h> +#include <stddef.h> #include "libbpf.h" #define MAX_INSNS 512 @@ -642,6 +643,84 @@ static struct bpf_test tests[] = { }, .result = ACCEPT, }, + { + "access skb fields ok", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, pkt_type)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, queue_mapping)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, protocol)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, vlan_present)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, vlan_tci)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "access skb fields bad1", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "access skb fields bad2", + .insns = { + BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 9), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, pkt_type)), + BPF_EXIT_INSN(), + }, + .fixup = {4}, + .errstr = "different pointers", + .result = REJECT, + }, + { + "access skb fields bad3", + .insns = { + BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, pkt_type)), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_JMP_IMM(BPF_JA, 0, 0, -12), + }, + .fixup = {6}, + .errstr = "different pointers", + .result = REJECT, + }, }; static int probe_filter_length(struct bpf_insn *fp) |