diff options
37 files changed, 2548 insertions, 126 deletions
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 79bbc86abe77..223709443ea4 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1427,6 +1427,62 @@ int enetc_close(struct net_device *ndev) return 0; } +int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, + void *type_data) +{ + struct enetc_ndev_priv *priv = netdev_priv(ndev); + struct tc_mqprio_qopt *mqprio = type_data; + struct enetc_bdr *tx_ring; + u8 num_tc; + int i; + + if (type != TC_SETUP_QDISC_MQPRIO) + return -EOPNOTSUPP; + + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + num_tc = mqprio->num_tc; + + if (!num_tc) { + netdev_reset_tc(ndev); + netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); + + /* Reset all ring priorities to 0 */ + for (i = 0; i < priv->num_tx_rings; i++) { + tx_ring = priv->tx_ring[i]; + enetc_set_bdr_prio(&priv->si->hw, tx_ring->index, 0); + } + + return 0; + } + + /* Check if we have enough BD rings available to accommodate all TCs */ + if (num_tc > priv->num_tx_rings) { + netdev_err(ndev, "Max %d traffic classes supported\n", + priv->num_tx_rings); + return -EINVAL; + } + + /* For the moment, we use only one BD ring per TC. + * + * Configure num_tc BD rings with increasing priorities. + */ + for (i = 0; i < num_tc; i++) { + tx_ring = priv->tx_ring[i]; + enetc_set_bdr_prio(&priv->si->hw, tx_ring->index, i); + } + + /* Reset the number of netdev queues based on the TC count */ + netif_set_real_num_tx_queues(ndev, num_tc); + + netdev_set_num_tc(ndev, num_tc); + + /* Each TC is associated with one netdev queue */ + for (i = 0; i < num_tc; i++) + netdev_set_tc_queue(ndev, i, 1, i); + + return 0; +} + struct net_device_stats *enetc_get_stats(struct net_device *ndev) { struct enetc_ndev_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index ea443268bf70..541b4e2073fe 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -229,6 +229,9 @@ struct net_device_stats *enetc_get_stats(struct net_device *ndev); int enetc_set_features(struct net_device *ndev, netdev_features_t features); int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd); +int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, + void *type_data); + /* ethtool */ void enetc_set_ethtool_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 6559cef4b07d..88276299f447 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -127,7 +127,7 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_TBSR_BUSY BIT(0) #define ENETC_TBMR_VIH BIT(9) #define ENETC_TBMR_PRIO_MASK GENMASK(2, 0) -#define ENETC_TBMR_PRIO_SET(val) val +#define ENETC_TBMR_SET_PRIO(val) ((val) & ENETC_TBMR_PRIO_MASK) #define ENETC_TBMR_EN BIT(31) #define ENETC_TBSR 0x4 #define ENETC_TBBAR0 0x10 @@ -544,3 +544,13 @@ static inline void enetc_enable_txvlan(struct enetc_hw *hw, int si_idx, val = (val & ~ENETC_TBMR_VIH) | (en ? ENETC_TBMR_VIH : 0); enetc_txbdr_wr(hw, si_idx, ENETC_TBMR, val); } + +static inline void enetc_set_bdr_prio(struct enetc_hw *hw, int bdr_idx, + int prio) +{ + u32 val = enetc_txbdr_rd(hw, bdr_idx, ENETC_TBMR); + + val &= ~ENETC_TBMR_PRIO_MASK; + val |= ENETC_TBMR_SET_PRIO(prio); + enetc_txbdr_wr(hw, bdr_idx, ENETC_TBMR, val); +} diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index d78ec8d43c39..258b3cb38a6f 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -703,6 +703,7 @@ static const struct net_device_ops enetc_ndev_ops = { .ndo_set_vf_spoofchk = enetc_pf_set_vf_spoofchk, .ndo_set_features = enetc_pf_set_features, .ndo_do_ioctl = enetc_ioctl, + .ndo_setup_tc = enetc_setup_tc, }; static void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c index 17f72644c5a1..ebd21bf4cfa1 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c @@ -112,6 +112,7 @@ static const struct net_device_ops enetc_ndev_ops = { .ndo_set_mac_address = enetc_vf_set_mac_addr, .ndo_set_features = enetc_vf_set_features, .ndo_do_ioctl = enetc_ioctl, + .ndo_setup_tc = enetc_setup_tc, }; static void enetc_vf_netdev_setup(struct enetc_si *si, struct net_device *ndev, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index fc4917ac44be..30354fa33a36 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -252,6 +252,7 @@ static void hns3_dbg_help(struct hnae3_handle *h) dev_info(&h->pdev->dev, "dump qos buf cfg\n"); dev_info(&h->pdev->dev, "dump mng tbl\n"); dev_info(&h->pdev->dev, "dump reset info\n"); + dev_info(&h->pdev->dev, "dump m7 info\n"); dev_info(&h->pdev->dev, "dump ncl_config <offset> <length>(in hex)\n"); dev_info(&h->pdev->dev, "dump mac tnl status\n"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 5e12705bae34..f6dc3054bcd1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3901,6 +3901,8 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) hns3_client_stop(handle); + hns3_uninit_phy(netdev); + if (!test_and_clear_bit(HNS3_NIC_STATE_INITED, &priv->state)) { netdev_warn(netdev, "already uninitialized\n"); goto out_netdev_free; @@ -3910,8 +3912,6 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) hns3_force_clear_all_rx_ring(handle); - hns3_uninit_phy(netdev); - hns3_nic_uninit_vector_data(priv); ret = hns3_nic_dealloc_vector_data(priv); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index c14480f9b625..408efd55ba48 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -145,7 +145,7 @@ enum hns3_nic_state { #define HNS3_RXD_TSIND_M (0x7 << HNS3_RXD_TSIND_S) #define HNS3_RXD_LKBK_B 15 #define HNS3_RXD_GRO_SIZE_S 16 -#define HNS3_RXD_GRO_SIZE_M (0x3ff << HNS3_RXD_GRO_SIZE_S) +#define HNS3_RXD_GRO_SIZE_M (0x3fff << HNS3_RXD_GRO_SIZE_S) #define HNS3_TXD_L3T_S 0 #define HNS3_TXD_L3T_M (0x3 << HNS3_TXD_L3T_S) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index fbd904e3077c..e5329053842b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -110,8 +110,7 @@ static void hclge_cmd_config_regs(struct hclge_cmq_ring *ring) hclge_write_dev(hw, HCLGE_NIC_CSQ_BASEADDR_H_REG, upper_32_bits(dma)); hclge_write_dev(hw, HCLGE_NIC_CSQ_DEPTH_REG, - (ring->desc_num >> HCLGE_NIC_CMQ_DESC_NUM_S) | - HCLGE_NIC_CMQ_ENABLE); + ring->desc_num >> HCLGE_NIC_CMQ_DESC_NUM_S); hclge_write_dev(hw, HCLGE_NIC_CSQ_HEAD_REG, 0); hclge_write_dev(hw, HCLGE_NIC_CSQ_TAIL_REG, 0); } else { @@ -120,8 +119,7 @@ static void hclge_cmd_config_regs(struct hclge_cmq_ring *ring) hclge_write_dev(hw, HCLGE_NIC_CRQ_BASEADDR_H_REG, upper_32_bits(dma)); hclge_write_dev(hw, HCLGE_NIC_CRQ_DEPTH_REG, - (ring->desc_num >> HCLGE_NIC_CMQ_DESC_NUM_S) | - HCLGE_NIC_CMQ_ENABLE); + ring->desc_num >> HCLGE_NIC_CMQ_DESC_NUM_S); hclge_write_dev(hw, HCLGE_NIC_CRQ_HEAD_REG, 0); hclge_write_dev(hw, HCLGE_NIC_CRQ_TAIL_REG, 0); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index d79a209b80f6..61cb10dc2b9b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -243,6 +243,9 @@ enum hclge_opcode_type { /* NCL config command */ HCLGE_OPC_QUERY_NCL_CONFIG = 0x7011, + /* M7 stats command */ + HCLGE_OPC_M7_STATS_BD = 0x7012, + HCLGE_OPC_M7_STATS_INFO = 0x7013, /* SFP command */ HCLGE_OPC_GET_SFP_INFO = 0x7104, @@ -970,6 +973,11 @@ struct hclge_fd_ad_config_cmd { u8 rsv2[8]; }; +struct hclge_get_m7_bd_cmd { + __le32 bd_num; + u8 rsv[20]; +}; + int hclge_cmd_init(struct hclge_dev *hdev); static inline void hclge_write_reg(void __iomem *base, u32 reg, u32 value) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index a9ffb57c4607..ed1f533d4082 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -921,6 +921,61 @@ static void hclge_dbg_dump_rst_info(struct hclge_dev *hdev) hdev->rst_stats.reset_cnt); } +void hclge_dbg_get_m7_stats_info(struct hclge_dev *hdev) +{ + struct hclge_desc *desc_src, *desc_tmp; + struct hclge_get_m7_bd_cmd *req; + struct hclge_desc desc; + u32 bd_num, buf_len; + int ret, i; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_M7_STATS_BD, true); + + req = (struct hclge_get_m7_bd_cmd *)desc.data; + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "get firmware statistics bd number failed, ret=%d\n", + ret); + return; + } + + bd_num = le32_to_cpu(req->bd_num); + + buf_len = sizeof(struct hclge_desc) * bd_num; + desc_src = kzalloc(buf_len, GFP_KERNEL); + if (!desc_src) { + dev_err(&hdev->pdev->dev, + "allocate desc for get_m7_stats failed\n"); + return; + } + + desc_tmp = desc_src; + ret = hclge_dbg_cmd_send(hdev, desc_tmp, 0, bd_num, + HCLGE_OPC_M7_STATS_INFO); + if (ret) { + kfree(desc_src); + dev_err(&hdev->pdev->dev, + "get firmware statistics failed, ret=%d\n", ret); + return; + } + + for (i = 0; i < bd_num; i++) { + dev_info(&hdev->pdev->dev, "0x%08x 0x%08x 0x%08x\n", + le32_to_cpu(desc_tmp->data[0]), + le32_to_cpu(desc_tmp->data[1]), + le32_to_cpu(desc_tmp->data[2])); + dev_info(&hdev->pdev->dev, "0x%08x 0x%08x 0x%08x\n", + le32_to_cpu(desc_tmp->data[3]), + le32_to_cpu(desc_tmp->data[4]), + le32_to_cpu(desc_tmp->data[5])); + + desc_tmp++; + } + + kfree(desc_src); +} + /* hclge_dbg_dump_ncl_config: print specified range of NCL_CONFIG file * @hdev: pointer to struct hclge_dev * @cmd_buf: string that contains offset and length @@ -1029,6 +1084,8 @@ int hclge_dbg_run_cmd(struct hnae3_handle *handle, char *cmd_buf) hclge_dbg_dump_reg_cmd(hdev, cmd_buf); } else if (strncmp(cmd_buf, "dump reset info", 15) == 0) { hclge_dbg_dump_rst_info(hdev); + } else if (strncmp(cmd_buf, "dump m7 info", 12) == 0) { + hclge_dbg_get_m7_stats_info(hdev); } else if (strncmp(cmd_buf, "dump ncl_config", 15) == 0) { hclge_dbg_dump_ncl_config(hdev, &cmd_buf[sizeof("dump ncl_config")]); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a3fba7b8bcbb..0545f3890d1c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2427,7 +2427,8 @@ static void hclge_mbx_task_schedule(struct hclge_dev *hdev) static void hclge_reset_task_schedule(struct hclge_dev *hdev) { - if (!test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state)) + if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) && + !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state)) schedule_work(&hdev->rst_service_task); } @@ -2873,6 +2874,10 @@ int hclge_notify_client(struct hclge_dev *hdev, struct hnae3_client *client = hdev->nic_client; u16 i; + if (!test_bit(HCLGE_STATE_NIC_REGISTERED, &hdev->state) || + !client) + return 0; + if (!client->ops->reset_notify) return -EOPNOTSUPP; @@ -2898,7 +2903,8 @@ static int hclge_notify_roce_client(struct hclge_dev *hdev, int ret = 0; u16 i; - if (!client) + if (!test_bit(HCLGE_STATE_ROCE_REGISTERED, &hdev->state) || + !client) return 0; if (!client->ops->reset_notify) @@ -3192,6 +3198,8 @@ static int hclge_reset_prepare_down(struct hclge_dev *hdev) static int hclge_reset_prepare_wait(struct hclge_dev *hdev) { +#define HCLGE_RESET_SYNC_TIME 100 + u32 reg_val; int ret = 0; @@ -3200,7 +3208,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) /* There is no mechanism for PF to know if VF has stopped IO * for now, just wait 100 ms for VF to stop IO */ - msleep(100); + msleep(HCLGE_RESET_SYNC_TIME); ret = hclge_func_reset_cmd(hdev, 0); if (ret) { dev_err(&hdev->pdev->dev, @@ -3220,7 +3228,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) /* There is no mechanism for PF to know if VF has stopped IO * for now, just wait 100 ms for VF to stop IO */ - msleep(100); + msleep(HCLGE_RESET_SYNC_TIME); set_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state); set_bit(HNAE3_FLR_DOWN, &hdev->flr_state); hdev->rst_stats.flr_rst_cnt++; @@ -3234,6 +3242,10 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) break; } + /* inform hardware that preparatory work is done */ + msleep(HCLGE_RESET_SYNC_TIME); + hclge_write_dev(&hdev->hw, HCLGE_NIC_CSQ_DEPTH_REG, + HCLGE_NIC_CMQ_ENABLE); dev_info(&hdev->pdev->dev, "prepare wait ok\n"); return ret; @@ -5682,7 +5694,6 @@ static void hclge_fd_build_arfs_rule(const struct hclge_fd_rule_tuples *tuples, static int hclge_add_fd_entry_by_arfs(struct hnae3_handle *handle, u16 queue_id, u16 flow_id, struct flow_keys *fkeys) { -#ifdef CONFIG_RFS_ACCEL struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_fd_rule_tuples new_tuples; struct hclge_dev *hdev = vport->back; @@ -5758,7 +5769,6 @@ static int hclge_add_fd_entry_by_arfs(struct hnae3_handle *handle, u16 queue_id, } return rule->location; -#endif } static void hclge_rfs_filter_expire(struct hclge_dev *hdev) @@ -8166,6 +8176,52 @@ static void hclge_info_show(struct hclge_dev *hdev) dev_info(dev, "PF info end.\n"); } +static int hclge_init_nic_client_instance(struct hnae3_ae_dev *ae_dev, + struct hclge_vport *vport) +{ + struct hnae3_client *client = vport->nic.client; + struct hclge_dev *hdev = ae_dev->priv; + int ret; + + ret = client->ops->init_instance(&vport->nic); + if (ret) + return ret; + + set_bit(HCLGE_STATE_NIC_REGISTERED, &hdev->state); + hnae3_set_client_init_flag(client, ae_dev, 1); + + if (netif_msg_drv(&hdev->vport->nic)) + hclge_info_show(hdev); + + return 0; +} + +static int hclge_init_roce_client_instance(struct hnae3_ae_dev *ae_dev, + struct hclge_vport *vport) +{ + struct hnae3_client *client = vport->roce.client; + struct hclge_dev *hdev = ae_dev->priv; + int ret; + + if (!hnae3_dev_roce_supported(hdev) || !hdev->roce_client || + !hdev->nic_client) + return 0; + + client = hdev->roce_client; + ret = hclge_init_roce_base_info(vport); + if (ret) + return ret; + + ret = client->ops->init_instance(&vport->roce); + if (ret) + return ret; + + set_bit(HCLGE_STATE_ROCE_REGISTERED, &hdev->state); + hnae3_set_client_init_flag(client, ae_dev, 1); + + return 0; +} + static int hclge_init_client_instance(struct hnae3_client *client, struct hnae3_ae_dev *ae_dev) { @@ -8181,30 +8237,13 @@ static int hclge_init_client_instance(struct hnae3_client *client, hdev->nic_client = client; vport->nic.client = client; - ret = client->ops->init_instance(&vport->nic); + ret = hclge_init_nic_client_instance(ae_dev, vport); if (ret) goto clear_nic; - hnae3_set_client_init_flag(client, ae_dev, 1); - - if (netif_msg_drv(&hdev->vport->nic)) - hclge_info_show(hdev); - - if (hdev->roce_client && - hnae3_dev_roce_supported(hdev)) { - struct hnae3_client *rc = hdev->roce_client; - - ret = hclge_init_roce_base_info(vport); - if (ret) - goto clear_roce; - - ret = rc->ops->init_instance(&vport->roce); - if (ret) - goto clear_roce; - - hnae3_set_client_init_flag(hdev->roce_client, - ae_dev, 1); - } + ret = hclge_init_roce_client_instance(ae_dev, vport); + if (ret) + goto clear_roce; break; case HNAE3_CLIENT_UNIC: @@ -8224,17 +8263,9 @@ static int hclge_init_client_instance(struct hnae3_client *client, vport->roce.client = client; } - if (hdev->roce_client && hdev->nic_client) { - ret = hclge_init_roce_base_info(vport); - if (ret) - goto clear_roce; - - ret = client->ops->init_instance(&vport->roce); - if (ret) - goto clear_roce; - - hnae3_set_client_init_flag(client, ae_dev, 1); - } + ret = hclge_init_roce_client_instance(ae_dev, vport); + if (ret) + goto clear_roce; break; default: @@ -8264,6 +8295,7 @@ static void hclge_uninit_client_instance(struct hnae3_client *client, for (i = 0; i < hdev->num_vmdq_vport + 1; i++) { vport = &hdev->vport[i]; if (hdev->roce_client) { + clear_bit(HCLGE_STATE_ROCE_REGISTERED, &hdev->state); hdev->roce_client->ops->uninit_instance(&vport->roce, 0); hdev->roce_client = NULL; @@ -8272,6 +8304,7 @@ static void hclge_uninit_client_instance(struct hnae3_client *client, if (client->type == HNAE3_CLIENT_ROCE) return; if (hdev->nic_client && client->ops->uninit_instance) { + clear_bit(HCLGE_STATE_NIC_REGISTERED, &hdev->state); client->ops->uninit_instance(&vport->nic, 0); hdev->nic_client = NULL; vport->nic.client = NULL; @@ -8353,6 +8386,7 @@ static void hclge_state_init(struct hclge_dev *hdev) static void hclge_state_uninit(struct hclge_dev *hdev) { set_bit(HCLGE_STATE_DOWN, &hdev->state); + set_bit(HCLGE_STATE_REMOVING, &hdev->state); if (hdev->service_timer.function) del_timer_sync(&hdev->service_timer); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index c770390c6e57..2b3bc95ccbdf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -201,6 +201,8 @@ enum HCLGE_DEV_STATE { HCLGE_STATE_DOWN, HCLGE_STATE_DISABLED, HCLGE_STATE_REMOVING, + HCLGE_STATE_NIC_REGISTERED, + HCLGE_STATE_ROCE_REGISTERED, HCLGE_STATE_SERVICE_INITED, HCLGE_STATE_SERVICE_SCHED, HCLGE_STATE_RST_SERVICE_SCHED, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 0e04e63f2a94..d20f01720719 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -192,12 +192,10 @@ static int hclge_map_unmap_ring_to_vf_vector(struct hclge_vport *vport, bool en, return ret; ret = hclge_bind_ring_with_vector(vport, vector_id, en, &ring_chain); - if (ret) - return ret; hclge_free_vector_ring_chain(&ring_chain); - return 0; + return ret; } static int hclge_set_vf_promisc_mode(struct hclge_vport *vport, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c index 71f356fc2446..e1588c0e8bb9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c @@ -98,7 +98,6 @@ static void hclgevf_cmd_config_regs(struct hclgevf_cmq_ring *ring) hclgevf_write_dev(hw, HCLGEVF_NIC_CSQ_BASEADDR_H_REG, reg_val); reg_val = (ring->desc_num >> HCLGEVF_NIC_CMQ_DESC_NUM_S); - reg_val |= HCLGEVF_NIC_CMQ_ENABLE; hclgevf_write_dev(hw, HCLGEVF_NIC_CSQ_DEPTH_REG, reg_val); hclgevf_write_dev(hw, HCLGEVF_NIC_CSQ_HEAD_REG, 0); @@ -110,7 +109,6 @@ static void hclgevf_cmd_config_regs(struct hclgevf_cmq_ring *ring) hclgevf_write_dev(hw, HCLGEVF_NIC_CRQ_BASEADDR_H_REG, reg_val); reg_val = (ring->desc_num >> HCLGEVF_NIC_CMQ_DESC_NUM_S); - reg_val |= HCLGEVF_NIC_CMQ_ENABLE; hclgevf_write_dev(hw, HCLGEVF_NIC_CRQ_DEPTH_REG, reg_val); hclgevf_write_dev(hw, HCLGEVF_NIC_CRQ_HEAD_REG, 0); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 5d53467ee2d2..87a619db2780 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1306,6 +1306,10 @@ static int hclgevf_notify_client(struct hclgevf_dev *hdev, struct hnae3_handle *handle = &hdev->nic; int ret; + if (!test_bit(HCLGEVF_STATE_NIC_REGISTERED, &hdev->state) || + !client) + return 0; + if (!client->ops->reset_notify) return -EOPNOTSUPP; @@ -1410,6 +1414,8 @@ static int hclgevf_reset_stack(struct hclgevf_dev *hdev) static int hclgevf_reset_prepare_wait(struct hclgevf_dev *hdev) { +#define HCLGEVF_RESET_SYNC_TIME 100 + int ret = 0; switch (hdev->reset_type) { @@ -1427,7 +1433,10 @@ static int hclgevf_reset_prepare_wait(struct hclgevf_dev *hdev) } set_bit(HCLGEVF_STATE_CMD_DISABLE, &hdev->state); - + /* inform hardware that preparatory work is done */ + msleep(HCLGEVF_RESET_SYNC_TIME); + hclgevf_write_dev(&hdev->hw, HCLGEVF_NIC_CSQ_DEPTH_REG, + HCLGEVF_NIC_CMQ_ENABLE); dev_info(&hdev->pdev->dev, "prepare reset(%d) wait done, ret:%d\n", hdev->reset_type, ret); @@ -1612,7 +1621,8 @@ static void hclgevf_get_misc_vector(struct hclgevf_dev *hdev) void hclgevf_reset_task_schedule(struct hclgevf_dev *hdev) { - if (!test_bit(HCLGEVF_STATE_RST_SERVICE_SCHED, &hdev->state)) { + if (!test_bit(HCLGEVF_STATE_RST_SERVICE_SCHED, &hdev->state) && + !test_bit(HCLGEVF_STATE_REMOVING, &hdev->state)) { set_bit(HCLGEVF_STATE_RST_SERVICE_SCHED, &hdev->state); schedule_work(&hdev->rst_service_task); } @@ -2123,6 +2133,7 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev) static void hclgevf_state_uninit(struct hclgevf_dev *hdev) { set_bit(HCLGEVF_STATE_DOWN, &hdev->state); + set_bit(HCLGEVF_STATE_REMOVING, &hdev->state); if (hdev->keep_alive_timer.function) del_timer_sync(&hdev->keep_alive_timer); @@ -2249,6 +2260,48 @@ static void hclgevf_info_show(struct hclgevf_dev *hdev) dev_info(dev, "VF info end.\n"); } +static int hclgevf_init_nic_client_instance(struct hnae3_ae_dev *ae_dev, + struct hnae3_client *client) +{ + struct hclgevf_dev *hdev = ae_dev->priv; + int ret; + + ret = client->ops->init_instance(&hdev->nic); + if (ret) + return ret; + + set_bit(HCLGEVF_STATE_NIC_REGISTERED, &hdev->state); + hnae3_set_client_init_flag(client, ae_dev, 1); + + if (netif_msg_drv(&hdev->nic)) + hclgevf_info_show(hdev); + + return 0; +} + +static int hclgevf_init_roce_client_instance(struct hnae3_ae_dev *ae_dev, + struct hnae3_client *client) +{ + struct hclgevf_dev *hdev = ae_dev->priv; + int ret; + + if (!hnae3_dev_roce_supported(hdev) || !hdev->roce_client || + !hdev->nic_client) + return 0; + + ret = hclgevf_init_roce_base_info(hdev); + if (ret) + return ret; + + ret = client->ops->init_instance(&hdev->roce); + if (ret) + return ret; + + hnae3_set_client_init_flag(client, ae_dev, 1); + + return 0; +} + static int hclgevf_init_client_instance(struct hnae3_client *client, struct hnae3_ae_dev *ae_dev) { @@ -2260,28 +2313,15 @@ static int hclgevf_init_client_instance(struct hnae3_client *client, hdev->nic_client = client; hdev->nic.client = client; - ret = client->ops->init_instance(&hdev->nic); + ret = hclgevf_init_nic_client_instance(ae_dev, client); if (ret) goto clear_nic; - hnae3_set_client_init_flag(client, ae_dev, 1); - - if (netif_msg_drv(&hdev->nic)) - hclgevf_info_show(hdev); - - if (hdev->roce_client && hnae3_dev_roce_supported(hdev)) { - struct hnae3_client *rc = hdev->roce_client; - - ret = hclgevf_init_roce_base_info(hdev); - if (ret) - goto clear_roce; - ret = rc->ops->init_instance(&hdev->roce); - if (ret) - goto clear_roce; + ret = hclgevf_init_roce_client_instance(ae_dev, + hdev->roce_client); + if (ret) + goto clear_roce; - hnae3_set_client_init_flag(hdev->roce_client, ae_dev, - 1); - } break; case HNAE3_CLIENT_UNIC: hdev->nic_client = client; @@ -2299,17 +2339,10 @@ static int hclgevf_init_client_instance(struct hnae3_client *client, hdev->roce.client = client; } - if (hdev->roce_client && hdev->nic_client) { - ret = hclgevf_init_roce_base_info(hdev); - if (ret) - goto clear_roce; - - ret = client->ops->init_instance(&hdev->roce); - if (ret) - goto clear_roce; - } + ret = hclgevf_init_roce_client_instance(ae_dev, client); + if (ret) + goto clear_roce; - hnae3_set_client_init_flag(client, ae_dev, 1); break; default: return -EINVAL; @@ -2342,6 +2375,8 @@ static void hclgevf_uninit_client_instance(struct hnae3_client *client, /* un-init nic/unic, if this was not called by roce client */ if (client->ops->uninit_instance && hdev->nic_client && client->type != HNAE3_CLIENT_ROCE) { + clear_bit(HCLGEVF_STATE_NIC_REGISTERED, &hdev->state); + client->ops->uninit_instance(&hdev->nic, 0); hdev->nic_client = NULL; hdev->nic.client = NULL; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index cc52f54f8c08..a7fbd38c1492 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -130,6 +130,8 @@ enum hclgevf_states { HCLGEVF_STATE_DOWN, HCLGEVF_STATE_DISABLED, HCLGEVF_STATE_IRQ_INITED, + HCLGEVF_STATE_REMOVING, + HCLGEVF_STATE_NIC_REGISTERED, /* task states */ HCLGEVF_STATE_SERVICE_SCHED, HCLGEVF_STATE_RST_SERVICE_SCHED, diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index e64bc664f687..cfd3f4232cac 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -724,7 +724,6 @@ static void set_rx_mode(struct work_struct *work) { struct hinic_rx_mode_work *rx_mode_work = work_to_rx_mode_work(work); struct hinic_dev *nic_dev = rx_mode_work_to_nic_dev(rx_mode_work); - struct netdev_hw_addr *ha; netif_info(nic_dev, drv, nic_dev->netdev, "set rx mode work\n"); @@ -732,9 +731,6 @@ static void set_rx_mode(struct work_struct *work) __dev_uc_sync(nic_dev->netdev, add_mac_addr, remove_mac_addr); __dev_mc_sync(nic_dev->netdev, add_mac_addr, remove_mac_addr); - - netdev_for_each_mc_addr(ha, nic_dev->netdev) - add_mac_addr(nic_dev->netdev, ha->addr); } static void hinic_set_rx_mode(struct net_device *netdev) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c index 7fdd1760a74c..5ae474ebaaed 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c @@ -1,14 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Amlogic Meson6 and Meson8 DWMAC glue layer * * Copyright (C) 2014 Beniamino Galvani <b.galvani@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/device.h> diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index c5979569fd60..c06295ec1ef0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -1,14 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Amlogic Meson8b, Meson8m2 and GXBB DWMAC glue layer * * Copyright (C) 2016 Martin Blumenstingl <martin.blumenstingl@googlemail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/clk.h> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 002f23c1a1a7..e91b79ad4e4a 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -3,6 +3,7 @@ #define __NET_FRAG_H__ #include <linux/rhashtable-types.h> +#include <linux/completion.h> /* Per netns frag queues directory */ struct fqdir { @@ -104,30 +105,14 @@ struct inet_frags { struct kmem_cache *frags_cachep; const char *frags_cache_name; struct rhashtable_params rhash_params; + refcount_t refcnt; + struct completion completion; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); -static inline int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, - struct net *net) -{ - struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); - int res; - - if (!fqdir) - return -ENOMEM; - fqdir->f = f; - fqdir->net = net; - res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); - if (res < 0) { - kfree(fqdir); - return res; - } - *fqdirp = fqdir; - return 0; -} - +int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); void fqdir_exit(struct fqdir *fqdir); void inet_frag_kill(struct inet_frag_queue *q); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 12689ddfc24c..abb4f92456e1 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -19,6 +19,7 @@ #include <net/netns/packet.h> #include <net/netns/ipv4.h> #include <net/netns/ipv6.h> +#include <net/netns/nexthop.h> #include <net/netns/ieee802154_6lowpan.h> #include <net/netns/sctp.h> #include <net/netns/dccp.h> @@ -108,6 +109,7 @@ struct net { struct netns_mib mib; struct netns_packet packet; struct netns_unix unx; + struct netns_nexthop nexthop; struct netns_ipv4 ipv4; #if IS_ENABLED(CONFIG_IPV6) struct netns_ipv6 ipv6; diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h new file mode 100644 index 000000000000..c712ee5eebd9 --- /dev/null +++ b/include/net/netns/nexthop.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * nexthops in net namespaces + */ + +#ifndef __NETNS_NEXTHOP_H__ +#define __NETNS_NEXTHOP_H__ + +#include <linux/rbtree.h> + +struct netns_nexthop { + struct rb_root rb_root; /* tree of nexthops by id */ + struct hlist_head *devhash; /* nexthops by device */ + + unsigned int seq; /* protected by rtnl_mutex */ + u32 last_id_allocated; +}; +#endif diff --git a/include/net/nexthop.h b/include/net/nexthop.h new file mode 100644 index 000000000000..6e1b8f53624c --- /dev/null +++ b/include/net/nexthop.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Generic nexthop implementation + * + * Copyright (c) 2017-19 Cumulus Networks + * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> + */ + +#ifndef __LINUX_NEXTHOP_H +#define __LINUX_NEXTHOP_H + +#include <linux/netdevice.h> +#include <linux/types.h> +#include <net/ip_fib.h> +#include <net/ip6_fib.h> +#include <net/netlink.h> + +#define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK + +struct nexthop; + +struct nh_config { + u32 nh_id; + + u8 nh_family; + u8 nh_protocol; + u8 nh_blackhole; + u32 nh_flags; + + int nh_ifindex; + struct net_device *dev; + + union { + __be32 ipv4; + struct in6_addr ipv6; + } gw; + + struct nlattr *nh_grp; + u16 nh_grp_type; + + struct nlattr *nh_encap; + u16 nh_encap_type; + + u32 nlflags; + struct nl_info nlinfo; +}; + +struct nh_info { + struct hlist_node dev_hash; /* entry on netns devhash */ + struct nexthop *nh_parent; + + u8 family; + bool reject_nh; + + union { + struct fib_nh_common fib_nhc; + struct fib_nh fib_nh; + struct fib6_nh fib6_nh; + }; +}; + +struct nh_grp_entry { + struct nexthop *nh; + u8 weight; + atomic_t upper_bound; + + struct list_head nh_list; + struct nexthop *nh_parent; /* nexthop of group with this entry */ +}; + +struct nh_group { + u16 num_nh; + bool mpath; + bool has_v4; + struct nh_grp_entry nh_entries[0]; +}; + +struct nexthop { + struct rb_node rb_node; /* entry on netns rbtree */ + struct list_head grp_list; /* nh group entries using this nh */ + struct net *net; + + u32 id; + + u8 protocol; /* app managing this nh */ + u8 nh_flags; + bool is_group; + + refcount_t refcnt; + struct rcu_head rcu; + + union { + struct nh_info __rcu *nh_info; + struct nh_group __rcu *nh_grp; + }; +}; + +/* caller is holding rcu or rtnl; no reference taken to nexthop */ +struct nexthop *nexthop_find_by_id(struct net *net, u32 id); +void nexthop_free_rcu(struct rcu_head *head); + +static inline bool nexthop_get(struct nexthop *nh) +{ + return refcount_inc_not_zero(&nh->refcnt); +} + +static inline void nexthop_put(struct nexthop *nh) +{ + if (refcount_dec_and_test(&nh->refcnt)) + call_rcu(&nh->rcu, nexthop_free_rcu); +} + +static inline bool nexthop_is_multipath(const struct nexthop *nh) +{ + if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_rtnl(nh->nh_grp); + return nh_grp->mpath; + } + return false; +} + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash); + +static inline unsigned int nexthop_num_path(const struct nexthop *nh) +{ + unsigned int rc = 1; + + if (nexthop_is_multipath(nh)) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_rtnl(nh->nh_grp); + rc = nh_grp->num_nh; + } else { + const struct nh_info *nhi; + + nhi = rcu_dereference_rtnl(nh->nh_info); + if (nhi->reject_nh) + rc = 0; + } + + return rc; +} + +static inline +struct nexthop *nexthop_mpath_select(const struct nexthop *nh, int nhsel) +{ + const struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + + /* for_nexthops macros in fib_semantics.c grabs a pointer to + * the nexthop before checking nhsel + */ + if (nhsel > nhg->num_nh) + return NULL; + + return nhg->nh_entries[nhsel].nh; +} + +static inline +int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + int i; + + for (i = 0; i < nhg->num_nh; i++) { + struct nexthop *nhe = nhg->nh_entries[i].nh; + struct nh_info *nhi = rcu_dereference_rtnl(nhe->nh_info); + struct fib_nh_common *nhc = &nhi->fib_nhc; + int weight = nhg->nh_entries[i].weight; + + if (fib_add_nexthop(skb, nhc, weight) < 0) + return -EMSGSIZE; + } + + return 0; +} + +/* called with rcu lock */ +static inline bool nexthop_is_blackhole(const struct nexthop *nh) +{ + const struct nh_info *nhi; + + if (nexthop_is_multipath(nh)) { + if (nexthop_num_path(nh) > 1) + return false; + nh = nexthop_mpath_select(nh, 0); + if (!nh) + return false; + } + + nhi = rcu_dereference_rtnl(nh->nh_info); + return nhi->reject_nh; +} +#endif diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h new file mode 100644 index 000000000000..7b61867e9848 --- /dev/null +++ b/include/uapi/linux/nexthop.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_NEXTHOP_H +#define _UAPI_LINUX_NEXTHOP_H + +#include <linux/types.h> + +struct nhmsg { + unsigned char nh_family; + unsigned char nh_scope; /* return only */ + unsigned char nh_protocol; /* Routing protocol that installed nh */ + unsigned char resvd; + unsigned int nh_flags; /* RTNH_F flags */ +}; + +/* entry in a nexthop group */ +struct nexthop_grp { + __u32 id; /* nexthop id - must exist */ + __u8 weight; /* weight of this nexthop */ + __u8 resvd1; + __u16 resvd2; +}; + +enum { + NEXTHOP_GRP_TYPE_MPATH, /* default type if not specified */ + __NEXTHOP_GRP_TYPE_MAX, +}; + +#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1) + +enum { + NHA_UNSPEC, + NHA_ID, /* u32; id for nexthop. id == 0 means auto-assign */ + + NHA_GROUP, /* array of nexthop_grp */ + NHA_GROUP_TYPE, /* u16 one of NEXTHOP_GRP_TYPE */ + /* if NHA_GROUP attribute is added, no other attributes can be set */ + + NHA_BLACKHOLE, /* flag; nexthop used to blackhole packets */ + /* if NHA_BLACKHOLE is added, OIF, GATEWAY, ENCAP can not be set */ + + NHA_OIF, /* u32; nexthop device */ + NHA_GATEWAY, /* be32 (IPv4) or in6_addr (IPv6) gw address */ + NHA_ENCAP_TYPE, /* u16; lwt encap type */ + NHA_ENCAP, /* lwt encap data */ + + /* NHA_OIF can be appended to dump request to return only + * nexthops using given device + */ + NHA_GROUPS, /* flag; only return nexthop groups in dump */ + NHA_MASTER, /* u32; only return nexthops with given master dev */ + + __NHA_MAX, +}; + +#define NHA_MAX (__NHA_MAX - 1) +#endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 46399367627f..ce2a623abb75 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -157,6 +157,13 @@ enum { RTM_GETCHAIN, #define RTM_GETCHAIN RTM_GETCHAIN + RTM_NEWNEXTHOP = 104, +#define RTM_NEWNEXTHOP RTM_NEWNEXTHOP + RTM_DELNEXTHOP, +#define RTM_DELNEXTHOP RTM_DELNEXTHOP + RTM_GETNEXTHOP, +#define RTM_GETNEXTHOP RTM_GETNEXTHOP + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -342,6 +349,7 @@ enum rtattr_type_t { RTA_IP_PROTO, RTA_SPORT, RTA_DPORT, + RTA_NH_ID, __RTA_MAX }; @@ -704,6 +712,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV6_MROUTE_R, #define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R + RTNLGRP_NEXTHOP, +#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index e59c3b708969..5b56f16ed86b 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -540,7 +540,7 @@ err_sysctl: void lowpan_net_frag_exit(void) { - inet_frags_fini(&lowpan_frags); lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); + inet_frags_fini(&lowpan_frags); } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 000a61994c8f..d57ecfaf89d4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o netlink.o + metrics.o netlink.o nexthop.o obj-$(CONFIG_BPFILTER) += bpfilter/ diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6ca9523374da..2b816f1ebbb4 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -110,14 +110,18 @@ int inet_frags_init(struct inet_frags *f) if (!f->frags_cachep) return -ENOMEM; + refcount_set(&f->refcnt, 1); + init_completion(&f->completion); return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - /* We must wait that all inet_frag_destroy_rcu() have completed. */ - rcu_barrier(); + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + + wait_for_completion(&f->completion); kmem_cache_destroy(f->frags_cachep); f->frags_cachep = NULL; @@ -149,11 +153,42 @@ static void fqdir_rwork_fn(struct work_struct *work) { struct fqdir *fqdir = container_of(to_rcu_work(work), struct fqdir, destroy_rwork); + struct inet_frags *f = fqdir->f; rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + + /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) + * have completed, since they need to dereference fqdir. + * Would it not be nice to have kfree_rcu_barrier() ? :) + */ + rcu_barrier(); + + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); + kfree(fqdir); } +int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) +{ + struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); + int res; + + if (!fqdir) + return -ENOMEM; + fqdir->f = f; + fqdir->net = net; + res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); + if (res < 0) { + kfree(fqdir); + return res; + } + refcount_inc(&f->refcnt); + *fqdirp = fqdir; + return 0; +} +EXPORT_SYMBOL(fqdir_init); + void fqdir_exit(struct fqdir *fqdir) { fqdir->high_thresh = 0; /* prevent creation of new frags */ diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c new file mode 100644 index 000000000000..1af8a329dacb --- /dev/null +++ b/net/ipv4/nexthop.c @@ -0,0 +1,1479 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Generic nexthop implementation + * + * Copyright (c) 2017-19 Cumulus Networks + * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> + */ + +#include <linux/nexthop.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <net/arp.h> +#include <net/ipv6_stubs.h> +#include <net/lwtunnel.h> +#include <net/ndisc.h> +#include <net/nexthop.h> +#include <net/route.h> +#include <net/sock.h> + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo); + +#define NH_DEV_HASHBITS 8 +#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) + +static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { + [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 }, + [NHA_ID] = { .type = NLA_U32 }, + [NHA_GROUP] = { .type = NLA_BINARY }, + [NHA_GROUP_TYPE] = { .type = NLA_U16 }, + [NHA_BLACKHOLE] = { .type = NLA_FLAG }, + [NHA_OIF] = { .type = NLA_U32 }, + [NHA_GATEWAY] = { .type = NLA_BINARY }, + [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, + [NHA_ENCAP] = { .type = NLA_NESTED }, + [NHA_GROUPS] = { .type = NLA_FLAG }, + [NHA_MASTER] = { .type = NLA_U32 }, +}; + +static unsigned int nh_dev_hashfn(unsigned int val) +{ + unsigned int mask = NH_DEV_HASHSIZE - 1; + + return (val ^ + (val >> NH_DEV_HASHBITS) ^ + (val >> (NH_DEV_HASHBITS * 2))) & mask; +} + +static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) +{ + struct net_device *dev = nhi->fib_nhc.nhc_dev; + struct hlist_head *head; + unsigned int hash; + + WARN_ON(!dev); + + hash = nh_dev_hashfn(dev->ifindex); + head = &net->nexthop.devhash[hash]; + hlist_add_head(&nhi->dev_hash, head); +} + +static void nexthop_free_mpath(struct nexthop *nh) +{ + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_raw(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) + WARN_ON(nhg->nh_entries[i].nh); + + kfree(nhg); +} + +static void nexthop_free_single(struct nexthop *nh) +{ + struct nh_info *nhi; + + nhi = rcu_dereference_raw(nh->nh_info); + switch (nhi->family) { + case AF_INET: + fib_nh_release(nh->net, &nhi->fib_nh); + break; + case AF_INET6: + ipv6_stub->fib6_nh_release(&nhi->fib6_nh); + break; + } + kfree(nhi); +} + +void nexthop_free_rcu(struct rcu_head *head) +{ + struct nexthop *nh = container_of(head, struct nexthop, rcu); + + if (nh->is_group) + nexthop_free_mpath(nh); + else + nexthop_free_single(nh); + + kfree(nh); +} +EXPORT_SYMBOL_GPL(nexthop_free_rcu); + +static struct nexthop *nexthop_alloc(void) +{ + struct nexthop *nh; + + nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); + if (nh) { + INIT_LIST_HEAD(&nh->grp_list); + } + return nh; +} + +static struct nh_group *nexthop_grp_alloc(u16 num_nh) +{ + size_t sz = offsetof(struct nexthop, nh_grp) + + sizeof(struct nh_group) + + sizeof(struct nh_grp_entry) * num_nh; + struct nh_group *nhg; + + nhg = kzalloc(sz, GFP_KERNEL); + if (nhg) + nhg->num_nh = num_nh; + + return nhg; +} + +static void nh_base_seq_inc(struct net *net) +{ + while (++net->nexthop.seq == 0) + ; +} + +/* no reference taken; rcu lock or rtnl must be held */ +struct nexthop *nexthop_find_by_id(struct net *net, u32 id) +{ + struct rb_node **pp, *parent = NULL, *next; + + pp = &net->nexthop.rb_root.rb_node; + while (1) { + struct nexthop *nh; + + next = rcu_dereference_raw(*pp); + if (!next) + break; + parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (id < nh->id) + pp = &next->rb_left; + else if (id > nh->id) + pp = &next->rb_right; + else + return nh; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nexthop_find_by_id); + +/* used for auto id allocation; called with rtnl held */ +static u32 nh_find_unused_id(struct net *net) +{ + u32 id_start = net->nexthop.last_id_allocated; + + while (1) { + net->nexthop.last_id_allocated++; + if (net->nexthop.last_id_allocated == id_start) + break; + + if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) + return net->nexthop.last_id_allocated; + } + return 0; +} + +static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nexthop_grp *p; + size_t len = nhg->num_nh * sizeof(*p); + struct nlattr *nla; + u16 group_type = 0; + int i; + + if (nhg->mpath) + group_type = NEXTHOP_GRP_TYPE_MPATH; + + if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) + goto nla_put_failure; + + nla = nla_reserve(skb, NHA_GROUP, len); + if (!nla) + goto nla_put_failure; + + p = nla_data(nla); + for (i = 0; i < nhg->num_nh; ++i) { + p->id = nhg->nh_entries[i].nh->id; + p->weight = nhg->nh_entries[i].weight - 1; + p += 1; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, + int event, u32 portid, u32 seq, unsigned int nlflags) +{ + struct fib6_nh *fib6_nh; + struct fib_nh *fib_nh; + struct nlmsghdr *nlh; + struct nh_info *nhi; + struct nhmsg *nhm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); + if (!nlh) + return -EMSGSIZE; + + nhm = nlmsg_data(nlh); + nhm->nh_family = AF_UNSPEC; + nhm->nh_flags = nh->nh_flags; + nhm->nh_protocol = nh->protocol; + nhm->nh_scope = 0; + nhm->resvd = 0; + + if (nla_put_u32(skb, NHA_ID, nh->id)) + goto nla_put_failure; + + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nla_put_nh_group(skb, nhg)) + goto nla_put_failure; + goto out; + } + + nhi = rtnl_dereference(nh->nh_info); + nhm->nh_family = nhi->family; + if (nhi->reject_nh) { + if (nla_put_flag(skb, NHA_BLACKHOLE)) + goto nla_put_failure; + goto out; + } else { + const struct net_device *dev; + + dev = nhi->fib_nhc.nhc_dev; + if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) + goto nla_put_failure; + } + + nhm->nh_scope = nhi->fib_nhc.nhc_scope; + switch (nhi->family) { + case AF_INET: + fib_nh = &nhi->fib_nh; + if (fib_nh->fib_nh_gw_family && + nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) + goto nla_put_failure; + break; + + case AF_INET6: + fib6_nh = &nhi->fib6_nh; + if (fib6_nh->fib_nh_gw_family && + nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) + goto nla_put_failure; + break; + } + + if (nhi->fib_nhc.nhc_lwtstate && + lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, + NHA_ENCAP, NHA_ENCAP_TYPE) < 0) + goto nla_put_failure; + +out: + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static size_t nh_nlmsg_size_grp(struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + + return nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ +} + +static size_t nh_nlmsg_size_single(struct nexthop *nh) +{ + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + size_t sz; + + /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE + * are mutually exclusive + */ + sz = nla_total_size(4); /* NHA_OIF */ + + switch (nhi->family) { + case AF_INET: + if (nhi->fib_nh.fib_nh_gw_family) + sz += nla_total_size(4); /* NHA_GATEWAY */ + break; + + case AF_INET6: + /* NHA_GATEWAY */ + if (nhi->fib6_nh.fib_nh_gw_family) + sz += nla_total_size(sizeof(const struct in6_addr)); + break; + } + + if (nhi->fib_nhc.nhc_lwtstate) { + sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); + sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ + } + + return sz; +} + +static size_t nh_nlmsg_size(struct nexthop *nh) +{ + size_t sz = nla_total_size(4); /* NHA_ID */ + + if (nh->is_group) + sz += nh_nlmsg_size_grp(nh); + else + sz += nh_nlmsg_size_single(nh); + + return sz; +} + +static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) +{ + unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); + if (!skb) + goto errout; + + err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); + if (err < 0) { + /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); +} + +static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, + struct netlink_ext_ack *extack) +{ + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + /* nested multipath (group within a group) is not + * supported + */ + if (nhg->mpath) { + NL_SET_ERR_MSG(extack, + "Multipath group can not be a nexthop within a group"); + return false; + } + } else { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + if (nhi->reject_nh && npaths > 1) { + NL_SET_ERR_MSG(extack, + "Blackhole nexthop can not be used in a group with more than 1 path"); + return false; + } + } + + return true; +} + +static int nh_check_attr_group(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + unsigned int len = nla_len(tb[NHA_GROUP]); + struct nexthop_grp *nhg; + unsigned int i, j; + + if (len & (sizeof(struct nexthop_grp) - 1)) { + NL_SET_ERR_MSG(extack, + "Invalid length for nexthop group attribute"); + return -EINVAL; + } + + /* convert len to number of nexthop ids */ + len /= sizeof(*nhg); + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + if (nhg[i].resvd1 || nhg[i].resvd2) { + NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); + return -EINVAL; + } + if (nhg[i].weight > 254) { + NL_SET_ERR_MSG(extack, "Invalid value for weight"); + return -EINVAL; + } + for (j = i + 1; j < len; ++j) { + if (nhg[i].id == nhg[j].id) { + NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); + return -EINVAL; + } + } + } + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + struct nexthop *nh; + + nh = nexthop_find_by_id(net, nhg[i].id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + if (!valid_group_nh(nh, len, extack)) + return -EINVAL; + } + for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static bool ipv6_good_nh(const struct fib6_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +static bool ipv4_good_nh(const struct fib_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +{ + struct nexthop *rc = NULL; + struct nh_group *nhg; + int i; + + if (!nh->is_group) + return nh; + + nhg = rcu_dereference(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + + if (hash > atomic_read(&nhge->upper_bound)) + continue; + + /* nexthops always check if it is good and does + * not rely on a sysctl for this behavior + */ + nhi = rcu_dereference(nhge->nh->nh_info); + switch (nhi->family) { + case AF_INET: + if (ipv4_good_nh(&nhi->fib_nh)) + return nhge->nh; + break; + case AF_INET6: + if (ipv6_good_nh(&nhi->fib6_nh)) + return nhge->nh; + break; + } + + if (!rc) + rc = nhge->nh; + } + + return rc; +} +EXPORT_SYMBOL_GPL(nexthop_select_path); + +static void nh_group_rebalance(struct nh_group *nhg) +{ + int total = 0; + int w = 0; + int i; + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; + atomic_set(&nhge->upper_bound, upper_bound); + } +} + +static void remove_nh_grp_entry(struct nh_grp_entry *nhge, + struct nh_group *nhg, + struct nl_info *nlinfo) +{ + struct nexthop *nh = nhge->nh; + struct nh_grp_entry *nhges; + bool found = false; + int i; + + WARN_ON(!nh); + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; ++i) { + if (found) { + nhges[i-1].nh = nhges[i].nh; + nhges[i-1].weight = nhges[i].weight; + list_del(&nhges[i].nh_list); + list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list); + } else if (nhg->nh_entries[i].nh == nh) { + found = true; + } + } + + if (WARN_ON(!found)) + return; + + nhg->num_nh--; + nhg->nh_entries[nhg->num_nh].nh = NULL; + + nh_group_rebalance(nhg); + + nexthop_put(nh); + + if (nlinfo) + nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo); +} + +static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + struct nh_grp_entry *nhge, *tmp; + + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { + struct nh_group *nhg; + + list_del(&nhge->nh_list); + nhg = rtnl_dereference(nhge->nh_parent->nh_grp); + remove_nh_grp_entry(nhge, nhg, nlinfo); + + /* if this group has no more entries then remove it */ + if (!nhg->num_nh) + remove_nexthop(net, nhge->nh_parent, nlinfo); + } +} + +static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) +{ + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + int i, num_nh = nhg->num_nh; + + for (i = 0; i < num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + if (WARN_ON(!nhge->nh)) + continue; + + list_del(&nhge->nh_list); + nexthop_put(nhge->nh); + nhge->nh = NULL; + nhg->num_nh--; + } +} + +static void __remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + if (nh->is_group) { + remove_nexthop_group(nh, nlinfo); + } else { + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (nhi->fib_nhc.nhc_dev) + hlist_del(&nhi->dev_hash); + + remove_nexthop_from_groups(net, nh, nlinfo); + } +} + +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + /* remove from the tree */ + rb_erase(&nh->rb_node, &net->nexthop.rb_root); + + if (nlinfo) + nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); + + __remove_nexthop(net, nh, nlinfo); + nh_base_seq_inc(net); + + nexthop_put(nh); +} + +static int replace_nexthop(struct net *net, struct nexthop *old, + struct nexthop *new, struct netlink_ext_ack *extack) +{ + return -EEXIST; +} + +/* called with rtnl_lock held */ +static int insert_nexthop(struct net *net, struct nexthop *new_nh, + struct nh_config *cfg, struct netlink_ext_ack *extack) +{ + struct rb_node **pp, *parent = NULL, *next; + struct rb_root *root = &net->nexthop.rb_root; + bool replace = !!(cfg->nlflags & NLM_F_REPLACE); + bool create = !!(cfg->nlflags & NLM_F_CREATE); + u32 new_id = new_nh->id; + int rc = -EEXIST; + + pp = &root->rb_node; + while (1) { + struct nexthop *nh; + + next = rtnl_dereference(*pp); + if (!next) + break; + + parent = next; + + nh = rb_entry(parent, struct nexthop, rb_node); + if (new_id < nh->id) { + pp = &next->rb_left; + } else if (new_id > nh->id) { + pp = &next->rb_right; + } else if (replace) { + rc = replace_nexthop(net, nh, new_nh, extack); + if (!rc) + new_nh = nh; /* send notification with old nh */ + goto out; + } else { + /* id already exists and not a replace */ + goto out; + } + } + + if (replace && !create) { + NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); + rc = -ENOENT; + goto out; + } + + rb_link_node_rcu(&new_nh->rb_node, parent, pp); + rb_insert_color(&new_nh->rb_node, root); + rc = 0; +out: + if (!rc) { + nh_base_seq_inc(net); + nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); + } + + return rc; +} + +/* rtnl */ +/* remove all nexthops tied to a device being deleted */ +static void nexthop_flush_dev(struct net_device *dev) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev != dev) + continue; + + remove_nexthop(net, nhi->nh_parent, NULL); + } +} + +/* rtnl; called when net namespace is deleted */ +static void flush_all_nexthops(struct net *net) +{ + struct rb_root *root = &net->nexthop.rb_root; + struct rb_node *node; + struct nexthop *nh; + + while ((node = rb_first(root))) { + nh = rb_entry(node, struct nexthop, rb_node); + remove_nexthop(net, nh, NULL); + cond_resched(); + } +} + +static struct nexthop *nexthop_create_group(struct net *net, + struct nh_config *cfg) +{ + struct nlattr *grps_attr = cfg->nh_grp; + struct nexthop_grp *entry = nla_data(grps_attr); + struct nh_group *nhg; + struct nexthop *nh; + int i; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nh->is_group = 1; + + nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); + if (!nhg) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nhg->num_nh; ++i) { + struct nexthop *nhe; + struct nh_info *nhi; + + nhe = nexthop_find_by_id(net, entry[i].id); + if (!nexthop_get(nhe)) + goto out_no_nh; + + nhi = rtnl_dereference(nhe->nh_info); + if (nhi->family == AF_INET) + nhg->has_v4 = true; + + nhg->nh_entries[i].nh = nhe; + nhg->nh_entries[i].weight = entry[i].weight + 1; + list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); + nhg->nh_entries[i].nh_parent = nh; + } + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->mpath = 1; + nh_group_rebalance(nhg); + } + + rcu_assign_pointer(nh->nh_grp, nhg); + + return nh; + +out_no_nh: + for (; i >= 0; --i) + nexthop_put(nhg->nh_entries[i].nh); + + kfree(nhg); + kfree(nh); + + return ERR_PTR(-ENOENT); +} + +static int nh_create_ipv4(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib_nh *fib_nh = &nhi->fib_nh; + struct fib_config fib_cfg = { + .fc_oif = cfg->nh_ifindex, + .fc_gw4 = cfg->gw.ipv4, + .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + u32 tb_id = l3mdev_fib_table(cfg->dev); + int err = -EINVAL; + + err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); + if (err) { + fib_nh_release(net, fib_nh); + goto out; + } + + /* sets nh_dev if successful */ + err = fib_check_nh(net, fib_nh, tb_id, 0, extack); + if (!err) { + nh->nh_flags = fib_nh->fib_nh_flags; + fib_info_update_nh_saddr(net, fib_nh, fib_nh->fib_nh_scope); + } else { + fib_nh_release(net, fib_nh); + } +out: + return err; +} + +static int nh_create_ipv6(struct net *net, struct nexthop *nh, + struct nh_info *nhi, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct fib6_nh *fib6_nh = &nhi->fib6_nh; + struct fib6_config fib6_cfg = { + .fc_table = l3mdev_fib_table(cfg->dev), + .fc_ifindex = cfg->nh_ifindex, + .fc_gateway = cfg->gw.ipv6, + .fc_flags = cfg->nh_flags, + .fc_encap = cfg->nh_encap, + .fc_encap_type = cfg->nh_encap_type, + }; + int err = -EINVAL; + + if (!ipv6_addr_any(&cfg->gw.ipv6)) + fib6_cfg.fc_flags |= RTF_GATEWAY; + + /* sets nh_dev if successful */ + err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, + extack); + if (err) + ipv6_stub->fib6_nh_release(fib6_nh); + else + nh->nh_flags = fib6_nh->fib_nh_flags; + + return err; +} + +static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + struct nexthop *nh; + int err = 0; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); + if (!nhi) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + nh->nh_flags = cfg->nh_flags; + nh->net = net; + + nhi->nh_parent = nh; + nhi->family = cfg->nh_family; + nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + + if (cfg->nh_blackhole) { + nhi->reject_nh = 1; + cfg->nh_ifindex = net->loopback_dev->ifindex; + } + + switch (cfg->nh_family) { + case AF_INET: + err = nh_create_ipv4(net, nh, nhi, cfg, extack); + break; + case AF_INET6: + err = nh_create_ipv6(net, nh, nhi, cfg, extack); + break; + } + + if (err) { + kfree(nhi); + kfree(nh); + return ERR_PTR(err); + } + + /* add the entry to the device based hash */ + nexthop_devhash_add(net, nhi); + + rcu_assign_pointer(nh->nh_info, nhi); + + return nh; +} + +/* called with rtnl lock held */ +static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nexthop *nh; + int err; + + if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { + NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); + return ERR_PTR(-EINVAL); + } + + if (!cfg->nh_id) { + cfg->nh_id = nh_find_unused_id(net); + if (!cfg->nh_id) { + NL_SET_ERR_MSG(extack, "No unused id"); + return ERR_PTR(-EINVAL); + } + } + + if (cfg->nh_grp) + nh = nexthop_create_group(net, cfg); + else + nh = nexthop_create(net, cfg, extack); + + if (IS_ERR(nh)) + return nh; + + refcount_set(&nh->refcnt, 1); + nh->id = cfg->nh_id; + nh->protocol = cfg->nh_protocol; + nh->net = net; + + err = insert_nexthop(net, nh, cfg, extack); + if (err) { + __remove_nexthop(net, nh, NULL); + nexthop_put(nh); + nh = ERR_PTR(err); + } + + return nh; +} + +static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 0) + return err; + + err = -EINVAL; + if (nhm->resvd || nhm->nh_scope) { + NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); + goto out; + } + if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { + NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); + goto out; + } + + switch (nhm->nh_family) { + case AF_INET: + case AF_INET6: + break; + case AF_UNSPEC: + if (tb[NHA_GROUP]) + break; + /* fallthrough */ + default: + NL_SET_ERR_MSG(extack, "Invalid address family"); + goto out; + } + + if (tb[NHA_GROUPS] || tb[NHA_MASTER]) { + NL_SET_ERR_MSG(extack, "Invalid attributes in request"); + goto out; + } + + memset(cfg, 0, sizeof(*cfg)); + cfg->nlflags = nlh->nlmsg_flags; + cfg->nlinfo.portid = NETLINK_CB(skb).portid; + cfg->nlinfo.nlh = nlh; + cfg->nlinfo.nl_net = net; + + cfg->nh_family = nhm->nh_family; + cfg->nh_protocol = nhm->nh_protocol; + cfg->nh_flags = nhm->nh_flags; + + if (tb[NHA_ID]) + cfg->nh_id = nla_get_u32(tb[NHA_ID]); + + if (tb[NHA_GROUP]) { + if (nhm->nh_family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid family for group"); + goto out; + } + cfg->nh_grp = tb[NHA_GROUP]; + + cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; + if (tb[NHA_GROUP_TYPE]) + cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); + + if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { + NL_SET_ERR_MSG(extack, "Invalid group type"); + goto out; + } + err = nh_check_attr_group(net, tb, extack); + + /* no other attributes should be set */ + goto out; + } + + if (tb[NHA_BLACKHOLE]) { + if (tb[NHA_GATEWAY] || tb[NHA_OIF] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + goto out; + } + + cfg->nh_blackhole = 1; + err = 0; + goto out; + } + + if (!tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + goto out; + } + + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } + + err = -EINVAL; + if (tb[NHA_GATEWAY]) { + struct nlattr *gwa = tb[NHA_GATEWAY]; + + switch (cfg->nh_family) { + case AF_INET: + if (nla_len(gwa) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv4 = nla_get_be32(gwa); + break; + case AF_INET6: + if (nla_len(gwa) != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid gateway"); + goto out; + } + cfg->gw.ipv6 = nla_get_in6_addr(gwa); + break; + default: + NL_SET_ERR_MSG(extack, + "Unknown address family for gateway"); + goto out; + } + } else { + /* device only nexthop (no gateway) */ + if (cfg->nh_flags & RTNH_F_ONLINK) { + NL_SET_ERR_MSG(extack, + "ONLINK flag can not be set for nexthop without a gateway"); + goto out; + } + } + + if (tb[NHA_ENCAP]) { + cfg->nh_encap = tb[NHA_ENCAP]; + + if (!tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); + goto out; + } + + cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); + if (err < 0) + goto out; + + } else if (tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); + goto out; + } + + + err = 0; +out: + return err; +} + +/* rtnl */ +static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nh_config cfg; + struct nexthop *nh; + int err; + + err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); + if (!err) { + nh = nexthop_add(net, &cfg, extack); + if (IS_ERR(nh)) + err = PTR_ERR(nh); + } + + return err; +} + +static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id, + struct netlink_ext_ack *extack) +{ + struct nhmsg *nhm = nlmsg_data(nlh); + struct nlattr *tb[NHA_MAX + 1]; + int err, i; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + extack); + if (err < 0) + return err; + + err = -EINVAL; + for (i = 0; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_ID: + break; + default: + NL_SET_ERR_MSG_ATTR(extack, tb[i], + "Unexpected attribute in request"); + goto out; + } + } + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header"); + goto out; + } + + if (!tb[NHA_ID]) { + NL_SET_ERR_MSG(extack, "Nexthop id is missing"); + goto out; + } + + *id = nla_get_u32(tb[NHA_ID]); + if (!(*id)) + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + else + err = 0; +out: + return err; +} + +/* rtnl */ +static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nl_info nlinfo = { + .nlh = nlh, + .nl_net = net, + .portid = NETLINK_CB(skb).portid, + }; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + nh = nexthop_find_by_id(net, id); + if (!nh) + return -ENOENT; + + remove_nexthop(net, nh, &nlinfo); + + return 0; +} + +/* rtnl */ +static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *skb = NULL; + struct nexthop *nh; + int err; + u32 id; + + err = nh_valid_get_del_req(nlh, &id, extack); + if (err) + return err; + + err = -ENOBUFS; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + err = -ENOENT; + nh = nexthop_find_by_id(net, id); + if (!nh) + goto errout_free; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + goto errout_free; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + return err; +errout_free: + kfree_skb(skb); + goto out; +} + +static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, + bool group_filter, u8 family) +{ + const struct net_device *dev; + const struct nh_info *nhi; + + if (group_filter && !nh->is_group) + return true; + + if (!dev_idx && !master_idx && !family) + return false; + + if (nh->is_group) + return true; + + nhi = rtnl_dereference(nh->nh_info); + if (family && nhi->family != family) + return true; + + dev = nhi->fib_nhc.nhc_dev; + if (dev_idx && (!dev || dev->ifindex != dev_idx)) + return true; + + if (master_idx) { + struct net_device *master; + + if (!dev) + return true; + + master = netdev_master_upper_dev_get((struct net_device *)dev); + if (!master || master->ifindex != master_idx) + return true; + } + + return false; +} + +static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, + int *master_idx, bool *group_filter, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NHA_MAX + 1]; + struct nhmsg *nhm; + int err, i; + u32 idx; + + err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, + NULL); + if (err < 0) + return err; + + for (i = 0; i <= NHA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NHA_OIF: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + return -EINVAL; + } + *dev_idx = idx; + break; + case NHA_MASTER: + idx = nla_get_u32(tb[i]); + if (idx > INT_MAX) { + NL_SET_ERR_MSG(extack, "Invalid master device index"); + return -EINVAL; + } + *master_idx = idx; + break; + case NHA_GROUPS: + *group_filter = true; + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + nhm = nlmsg_data(nlh); + if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); + return -EINVAL; + } + + return 0; +} + +/* rtnl */ +static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nhmsg *nhm = nlmsg_data(cb->nlh); + int dev_filter_idx = 0, master_idx = 0; + struct net *net = sock_net(skb->sk); + struct rb_root *root = &net->nexthop.rb_root; + bool group_filter = false; + struct rb_node *node; + int idx = 0, s_idx; + int err; + + err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, + &group_filter, cb); + if (err < 0) + return err; + + s_idx = cb->args[0]; + for (node = rb_first(root); node; node = rb_next(node)) { + struct nexthop *nh; + + if (idx < s_idx) + goto cont; + + nh = rb_entry(node, struct nexthop, rb_node); + if (nh_dump_filtered(nh, dev_filter_idx, master_idx, + group_filter, nhm->nh_family)) + goto cont; + + err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } +cont: + idx++; + } + +out: + err = skb->len; +out_err: + cb->args[0] = idx; + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + return err; +} + +static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) +{ + unsigned int hash = nh_dev_hashfn(dev->ifindex); + struct net *net = dev_net(dev); + struct hlist_head *head = &net->nexthop.devhash[hash]; + struct hlist_node *n; + struct nh_info *nhi; + + hlist_for_each_entry_safe(nhi, n, head, dev_hash) { + if (nhi->fib_nhc.nhc_dev == dev) { + if (nhi->family == AF_INET) + fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, + orig_mtu); + } + } +} + +/* rtnl */ +static int nh_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_info_ext *info_ext; + + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGE: + if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) + nexthop_flush_dev(dev); + break; + case NETDEV_CHANGEMTU: + info_ext = ptr; + nexthop_sync_mtu(dev, info_ext->ext.mtu); + rt_cache_flush(dev_net(dev)); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nh_netdev_notifier = { + .notifier_call = nh_netdev_event, +}; + +static void __net_exit nexthop_net_exit(struct net *net) +{ + rtnl_lock(); + flush_all_nexthops(net); + rtnl_unlock(); + kfree(net->nexthop.devhash); +} + +static int __net_init nexthop_net_init(struct net *net) +{ + size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; + + net->nexthop.rb_root = RB_ROOT; + net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); + if (!net->nexthop.devhash) + return -ENOMEM; + + return 0; +} + +static struct pernet_operations nexthop_net_ops = { + .init = nexthop_net_init, + .exit = nexthop_net_exit, +}; + +static int __init nexthop_init(void) +{ + register_pernet_subsys(&nexthop_net_ops); + + register_netdevice_notifier(&nh_netdev_notifier); + + rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, + rtm_dump_nexthop, 0); + + rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); + rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + + return 0; +} +subsys_initcall(nexthop_init); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 836ea964cf14..ff5b6d8de2c6 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -583,8 +583,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 9cec81209617..2c75d823d8e2 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -83,6 +83,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_NEWCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -166,7 +169,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOP + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 27ef4d07ac91..99a4e41d5249 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -18,3 +18,5 @@ tls txring_overwrite ip_defrag so_txtime +flowlabel +flowlabel_mgr diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8af7869e0f1c..8343fb9d8a46 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -9,13 +9,13 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \ TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh -TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh +TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag -TEST_GEN_FILES += so_txtime +TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls diff --git a/tools/testing/selftests/net/ipv6_flowlabel.c b/tools/testing/selftests/net/ipv6_flowlabel.c new file mode 100644 index 000000000000..a7c41375374f --- /dev/null +++ b/tools/testing/selftests/net/ipv6_flowlabel.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Test IPV6_FLOWINFO cmsg on send and recv */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <asm/byteorder.h> +#include <error.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/in6.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +/* uapi/glibc weirdness may leave this undefined */ +#ifndef IPV6_FLOWINFO +#define IPV6_FLOWINFO 11 +#endif + +#ifndef IPV6_FLOWLABEL_MGR +#define IPV6_FLOWLABEL_MGR 32 +#endif + +#define FLOWLABEL_WILDCARD ((uint32_t) -1) + +static const char cfg_data[] = "a"; +static uint32_t cfg_label = 1; + +static void do_send(int fd, bool with_flowlabel, uint32_t flowlabel) +{ + char control[CMSG_SPACE(sizeof(flowlabel))] = {0}; + struct msghdr msg = {0}; + struct iovec iov = {0}; + int ret; + + iov.iov_base = (char *)cfg_data; + iov.iov_len = sizeof(cfg_data); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + if (with_flowlabel) { + struct cmsghdr *cm; + + cm = (void *)control; + cm->cmsg_len = CMSG_LEN(sizeof(flowlabel)); + cm->cmsg_level = SOL_IPV6; + cm->cmsg_type = IPV6_FLOWINFO; + *(uint32_t *)CMSG_DATA(cm) = htonl(flowlabel); + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + } + + ret = sendmsg(fd, &msg, 0); + if (ret == -1) + error(1, errno, "send"); + + if (with_flowlabel) + fprintf(stderr, "sent with label %u\n", flowlabel); + else + fprintf(stderr, "sent without label\n"); +} + +static void do_recv(int fd, bool with_flowlabel, uint32_t expect) +{ + char control[CMSG_SPACE(sizeof(expect))]; + char data[sizeof(cfg_data)]; + struct msghdr msg = {0}; + struct iovec iov = {0}; + struct cmsghdr *cm; + uint32_t flowlabel; + int ret; + + iov.iov_base = data; + iov.iov_len = sizeof(data); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + memset(control, 0, sizeof(control)); + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + ret = recvmsg(fd, &msg, 0); + if (ret == -1) + error(1, errno, "recv"); + if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) + error(1, 0, "recv: truncated"); + if (ret != sizeof(cfg_data)) + error(1, 0, "recv: length mismatch"); + if (memcmp(data, cfg_data, sizeof(data))) + error(1, 0, "recv: data mismatch"); + + cm = CMSG_FIRSTHDR(&msg); + if (with_flowlabel) { + if (!cm) + error(1, 0, "recv: missing cmsg"); + if (CMSG_NXTHDR(&msg, cm)) + error(1, 0, "recv: too many cmsg"); + if (cm->cmsg_level != SOL_IPV6 || + cm->cmsg_type != IPV6_FLOWINFO) + error(1, 0, "recv: unexpected cmsg level or type"); + + flowlabel = ntohl(*(uint32_t *)CMSG_DATA(cm)); + fprintf(stderr, "recv with label %u\n", flowlabel); + + if (expect != FLOWLABEL_WILDCARD && expect != flowlabel) + fprintf(stderr, "recv: incorrect flowlabel %u != %u\n", + flowlabel, expect); + + } else { + fprintf(stderr, "recv without label\n"); + } +} + +static bool get_autoflowlabel_enabled(void) +{ + int fd, ret; + char val; + + fd = open("/proc/sys/net/ipv6/auto_flowlabels", O_RDONLY); + if (fd == -1) + error(1, errno, "open sysctl"); + + ret = read(fd, &val, 1); + if (ret == -1) + error(1, errno, "read sysctl"); + if (ret == 0) + error(1, 0, "read sysctl: 0"); + + if (close(fd)) + error(1, errno, "close sysctl"); + + return val == '1'; +} + +static void flowlabel_get(int fd, uint32_t label, uint8_t share, uint16_t flags) +{ + struct in6_flowlabel_req req = { + .flr_action = IPV6_FL_A_GET, + .flr_label = htonl(label), + .flr_flags = flags, + .flr_share = share, + }; + + /* do not pass IPV6_ADDR_ANY or IPV6_ADDR_MAPPED */ + req.flr_dst.s6_addr[0] = 0xfd; + req.flr_dst.s6_addr[15] = 0x1; + + if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, &req, sizeof(req))) + error(1, errno, "setsockopt flowlabel get"); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "l:")) != -1) { + switch (c) { + case 'l': + cfg_label = strtoul(optarg, NULL, 0); + break; + default: + error(1, 0, "%s: parse error", argv[0]); + } + } +} + +int main(int argc, char **argv) +{ + struct sockaddr_in6 addr = { + .sin6_family = AF_INET6, + .sin6_port = htons(8000), + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + const int one = 1; + int fdt, fdr; + + parse_opts(argc, argv); + + fdt = socket(PF_INET6, SOCK_DGRAM, 0); + if (fdt == -1) + error(1, errno, "socket t"); + + fdr = socket(PF_INET6, SOCK_DGRAM, 0); + if (fdr == -1) + error(1, errno, "socket r"); + + if (connect(fdt, (void *)&addr, sizeof(addr))) + error(1, errno, "connect"); + if (bind(fdr, (void *)&addr, sizeof(addr))) + error(1, errno, "bind"); + + flowlabel_get(fdt, cfg_label, IPV6_FL_S_EXCL, IPV6_FL_F_CREATE); + + if (setsockopt(fdr, SOL_IPV6, IPV6_FLOWINFO, &one, sizeof(one))) + error(1, errno, "setsockopt flowinfo"); + + if (get_autoflowlabel_enabled()) { + fprintf(stderr, "send no label: recv auto flowlabel\n"); + do_send(fdt, false, 0); + do_recv(fdr, true, FLOWLABEL_WILDCARD); + } else { + fprintf(stderr, "send no label: recv no label (auto off)\n"); + do_send(fdt, false, 0); + do_recv(fdr, false, 0); + } + + fprintf(stderr, "send label\n"); + do_send(fdt, true, cfg_label); + do_recv(fdr, true, cfg_label); + + if (close(fdr)) + error(1, errno, "close r"); + if (close(fdt)) + error(1, errno, "close t"); + + return 0; +} diff --git a/tools/testing/selftests/net/ipv6_flowlabel.sh b/tools/testing/selftests/net/ipv6_flowlabel.sh new file mode 100755 index 000000000000..d3bc6442704e --- /dev/null +++ b/tools/testing/selftests/net/ipv6_flowlabel.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Regression tests for IPv6 flowlabels +# +# run in separate namespaces to avoid mgmt db conflicts betweent tests + +set -e + +echo "TEST management" +./in_netns.sh ./ipv6_flowlabel_mgr + +echo "TEST datapath" +./in_netns.sh \ + sh -c 'sysctl -q -w net.ipv6.auto_flowlabels=0 && ./ipv6_flowlabel -l 1' + +echo "TEST datapath (with auto-flowlabels)" +./in_netns.sh \ + sh -c 'sysctl -q -w net.ipv6.auto_flowlabels=1 && ./ipv6_flowlabel -l 1' + +echo OK. All tests passed diff --git a/tools/testing/selftests/net/ipv6_flowlabel_mgr.c b/tools/testing/selftests/net/ipv6_flowlabel_mgr.c new file mode 100644 index 000000000000..af95b48acea9 --- /dev/null +++ b/tools/testing/selftests/net/ipv6_flowlabel_mgr.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Test IPV6_FLOWINFO_MGR */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <error.h> +#include <errno.h> +#include <limits.h> +#include <linux/in6.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +/* uapi/glibc weirdness may leave this undefined */ +#ifndef IPV6_FLOWLABEL_MGR +#define IPV6_FLOWLABEL_MGR 32 +#endif + +/* from net/ipv6/ip6_flowlabel.c */ +#define FL_MIN_LINGER 6 + +#define explain(x) \ + do { if (cfg_verbose) fprintf(stderr, " " x "\n"); } while (0) + +#define __expect(x) \ + do { \ + if (!(x)) \ + fprintf(stderr, "[OK] " #x "\n"); \ + else \ + error(1, 0, "[ERR] " #x " (line %d)", __LINE__); \ + } while (0) + +#define expect_pass(x) __expect(x) +#define expect_fail(x) __expect(!(x)) + +static bool cfg_long_running; +static bool cfg_verbose; + +static int flowlabel_get(int fd, uint32_t label, uint8_t share, uint16_t flags) +{ + struct in6_flowlabel_req req = { + .flr_action = IPV6_FL_A_GET, + .flr_label = htonl(label), + .flr_flags = flags, + .flr_share = share, + }; + + /* do not pass IPV6_ADDR_ANY or IPV6_ADDR_MAPPED */ + req.flr_dst.s6_addr[0] = 0xfd; + req.flr_dst.s6_addr[15] = 0x1; + + return setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, &req, sizeof(req)); +} + +static int flowlabel_put(int fd, uint32_t label) +{ + struct in6_flowlabel_req req = { + .flr_action = IPV6_FL_A_PUT, + .flr_label = htonl(label), + }; + + return setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, &req, sizeof(req)); +} + +static void run_tests(int fd) +{ + int wstatus; + pid_t pid; + + explain("cannot get non-existent label"); + expect_fail(flowlabel_get(fd, 1, IPV6_FL_S_ANY, 0)); + + explain("cannot put non-existent label"); + expect_fail(flowlabel_put(fd, 1)); + + explain("cannot create label greater than 20 bits"); + expect_fail(flowlabel_get(fd, 0x1FFFFF, IPV6_FL_S_ANY, + IPV6_FL_F_CREATE)); + + explain("create a new label (FL_F_CREATE)"); + expect_pass(flowlabel_get(fd, 1, IPV6_FL_S_ANY, IPV6_FL_F_CREATE)); + explain("can get the label (without FL_F_CREATE)"); + expect_pass(flowlabel_get(fd, 1, IPV6_FL_S_ANY, 0)); + explain("can get it again with create flag set, too"); + expect_pass(flowlabel_get(fd, 1, IPV6_FL_S_ANY, IPV6_FL_F_CREATE)); + explain("cannot get it again with the exclusive (FL_FL_EXCL) flag"); + expect_fail(flowlabel_get(fd, 1, IPV6_FL_S_ANY, + IPV6_FL_F_CREATE | IPV6_FL_F_EXCL)); + explain("can now put exactly three references"); + expect_pass(flowlabel_put(fd, 1)); + expect_pass(flowlabel_put(fd, 1)); + expect_pass(flowlabel_put(fd, 1)); + expect_fail(flowlabel_put(fd, 1)); + + explain("create a new exclusive label (FL_S_EXCL)"); + expect_pass(flowlabel_get(fd, 2, IPV6_FL_S_EXCL, IPV6_FL_F_CREATE)); + explain("cannot get it again in non-exclusive mode"); + expect_fail(flowlabel_get(fd, 2, IPV6_FL_S_ANY, IPV6_FL_F_CREATE)); + explain("cannot get it again in exclusive mode either"); + expect_fail(flowlabel_get(fd, 2, IPV6_FL_S_EXCL, IPV6_FL_F_CREATE)); + expect_pass(flowlabel_put(fd, 2)); + + if (cfg_long_running) { + explain("cannot reuse the label, due to linger"); + expect_fail(flowlabel_get(fd, 2, IPV6_FL_S_ANY, + IPV6_FL_F_CREATE)); + explain("after sleep, can reuse"); + sleep(FL_MIN_LINGER * 2 + 1); + expect_pass(flowlabel_get(fd, 2, IPV6_FL_S_ANY, + IPV6_FL_F_CREATE)); + } + + explain("create a new user-private label (FL_S_USER)"); + expect_pass(flowlabel_get(fd, 3, IPV6_FL_S_USER, IPV6_FL_F_CREATE)); + explain("cannot get it again in non-exclusive mode"); + expect_fail(flowlabel_get(fd, 3, IPV6_FL_S_ANY, 0)); + explain("cannot get it again in exclusive mode"); + expect_fail(flowlabel_get(fd, 3, IPV6_FL_S_EXCL, 0)); + explain("can get it again in user mode"); + expect_pass(flowlabel_get(fd, 3, IPV6_FL_S_USER, 0)); + explain("child process can get it too, but not after setuid(nobody)"); + pid = fork(); + if (pid == -1) + error(1, errno, "fork"); + if (!pid) { + expect_pass(flowlabel_get(fd, 3, IPV6_FL_S_USER, 0)); + if (setuid(USHRT_MAX)) + fprintf(stderr, "[INFO] skip setuid child test\n"); + else + expect_fail(flowlabel_get(fd, 3, IPV6_FL_S_USER, 0)); + exit(0); + } + if (wait(&wstatus) == -1) + error(1, errno, "wait"); + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) + error(1, errno, "wait: unexpected child result"); + + explain("create a new process-private label (FL_S_PROCESS)"); + expect_pass(flowlabel_get(fd, 4, IPV6_FL_S_PROCESS, IPV6_FL_F_CREATE)); + explain("can get it again"); + expect_pass(flowlabel_get(fd, 4, IPV6_FL_S_PROCESS, 0)); + explain("child process cannot can get it"); + pid = fork(); + if (pid == -1) + error(1, errno, "fork"); + if (!pid) { + expect_fail(flowlabel_get(fd, 4, IPV6_FL_S_PROCESS, 0)); + exit(0); + } + if (wait(&wstatus) == -1) + error(1, errno, "wait"); + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) + error(1, errno, "wait: unexpected child result"); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "lv")) != -1) { + switch (c) { + case 'l': + cfg_long_running = true; + break; + case 'v': + cfg_verbose = true; + break; + default: + error(1, 0, "%s: parse error", argv[0]); + } + } +} + +int main(int argc, char **argv) +{ + int fd; + + parse_opts(argc, argv); + + fd = socket(PF_INET6, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket"); + + run_tests(fd); + + if (close(fd)) + error(1, errno, "close"); + + return 0; +} |
