diff options
22 files changed, 1825 insertions, 73 deletions
diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bfba466d694a..1c4bb0cbe5f0 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -224,6 +224,10 @@ definitions: value: 10 - name: binary + - + name: rate-tc-index-max + type: const + value: 7 attribute-sets: - @@ -844,7 +848,23 @@ attribute-sets: - name: region-direct type: flag - + - + name: rate-tc-bws + type: nest + multi-attr: true + nested-attributes: dl-rate-tc-bws + - + name: rate-tc-index + type: u8 + checks: + max: rate-tc-index-max + - + name: rate-tc-bw + type: u32 + doc: | + Specifies the bandwidth share assigned to the Traffic Class. + The bandwidth for the traffic class is determined + in proportion to the sum of the shares of all configured classes. - name: dl-dev-stats subset-of: devlink @@ -1249,6 +1269,14 @@ attribute-sets: - name: flash type: flag + - + name: dl-rate-tc-bws + subset-of: devlink + attributes: + - + name: rate-tc-index + - + name: rate-tc-bw operations: enum-model: directional @@ -2176,6 +2204,7 @@ operations: - rate-tx-priority - rate-tx-weight - rate-parent-node-name + - rate-tc-bws - name: rate-new @@ -2196,6 +2225,7 @@ operations: - rate-tx-priority - rate-tx-weight - rate-parent-node-name + - rate-tc-bws - name: rate-del diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 9d22d41a7cd1..5e397798a402 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -418,6 +418,14 @@ API allows to configure following rate object's parameters: to all node children limits. ``tx_max`` is an upper limit for children. ``tx_share`` is a total bandwidth distributed among children. +``tc_bw`` + Allow users to set the bandwidth allocation per traffic class on rate + objects. This enables fine-grained QoS configurations by assigning a relative + share value to each traffic class. The bandwidth is distributed in proportion + to the share value for each class, relative to the sum of all shares. + When applied to a non-leaf node, tc_bw determines how bandwidth is shared + among its child elements. + ``tx_priority`` and ``tx_weight`` can be used simultaneously. In that case nodes with the same priority form a WFQ subgroup in the sibling group and arbitration among them is based on assigned weights. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 42218834183a..3ffa3fbacd16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -376,6 +376,8 @@ static const struct devlink_ops mlx5_devlink_ops = { .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, .rate_leaf_tx_share_set = mlx5_esw_devlink_rate_leaf_tx_share_set, .rate_leaf_tx_max_set = mlx5_esw_devlink_rate_leaf_tx_max_set, + .rate_leaf_tc_bw_set = mlx5_esw_devlink_rate_leaf_tc_bw_set, + .rate_node_tc_bw_set = mlx5_esw_devlink_rate_node_tc_bw_set, .rate_node_tx_share_set = mlx5_esw_devlink_rate_node_tx_share_set, .rate_node_tx_max_set = mlx5_esw_devlink_rate_node_tx_max_set, .rate_node_new = mlx5_esw_devlink_rate_node_new, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index b6ae384396b3..154bbb17ec0e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -64,11 +64,19 @@ static void esw_qos_domain_release(struct mlx5_eswitch *esw) enum sched_node_type { SCHED_NODE_TYPE_VPORTS_TSAR, SCHED_NODE_TYPE_VPORT, + SCHED_NODE_TYPE_TC_ARBITER_TSAR, + SCHED_NODE_TYPE_RATE_LIMITER, + SCHED_NODE_TYPE_VPORT_TC, + SCHED_NODE_TYPE_VPORTS_TC_TSAR, }; static const char * const sched_node_type_str[] = { [SCHED_NODE_TYPE_VPORTS_TSAR] = "vports TSAR", [SCHED_NODE_TYPE_VPORT] = "vport", + [SCHED_NODE_TYPE_TC_ARBITER_TSAR] = "TC Arbiter TSAR", + [SCHED_NODE_TYPE_RATE_LIMITER] = "Rate Limiter", + [SCHED_NODE_TYPE_VPORT_TC] = "vport TC", + [SCHED_NODE_TYPE_VPORTS_TC_TSAR] = "vports TC TSAR", }; struct mlx5_esw_sched_node { @@ -92,6 +100,8 @@ struct mlx5_esw_sched_node { struct mlx5_vport *vport; /* Level in the hierarchy. The root node level is 1. */ u8 level; + /* Valid only when this node represents a traffic class. */ + u8 tc; }; static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) @@ -106,6 +116,13 @@ static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) } } +static int esw_qos_num_tcs(struct mlx5_core_dev *dev) +{ + int num_tcs = mlx5_max_tc(dev) + 1; + + return num_tcs < DEVLINK_RATE_TCS_MAX ? num_tcs : DEVLINK_RATE_TCS_MAX; +} + static void esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent) { @@ -116,8 +133,38 @@ esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_ esw_qos_node_attach_to_parent(node); } +static void esw_qos_nodes_set_parent(struct list_head *nodes, + struct mlx5_esw_sched_node *parent) +{ + struct mlx5_esw_sched_node *node, *tmp; + + list_for_each_entry_safe(node, tmp, nodes, entry) { + esw_qos_node_set_parent(node, parent); + if (!list_empty(&node->children) && + parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + struct mlx5_esw_sched_node *child; + + list_for_each_entry(child, &node->children, entry) { + struct mlx5_vport *vport = child->vport; + + if (vport) + vport->qos.sched_node->parent = parent; + } + } + } +} + void mlx5_esw_qos_vport_qos_free(struct mlx5_vport *vport) { + if (vport->qos.sched_nodes) { + int num_tcs = esw_qos_num_tcs(vport->qos.sched_node->esw->dev); + int i; + + for (i = 0; i < num_tcs; i++) + kfree(vport->qos.sched_nodes[i]); + kfree(vport->qos.sched_nodes); + } + kfree(vport->qos.sched_node); memset(&vport->qos, 0, sizeof(vport->qos)); } @@ -141,16 +188,37 @@ mlx5_esw_qos_vport_get_parent(const struct mlx5_vport *vport) static void esw_qos_sched_elem_warn(struct mlx5_esw_sched_node *node, int err, const char *op) { - if (node->vport) { + switch (node->type) { + case SCHED_NODE_TYPE_VPORTS_TC_TSAR: + esw_warn(node->esw->dev, + "E-Switch %s %s scheduling element failed (tc=%d,err=%d)\n", + op, sched_node_type_str[node->type], node->tc, err); + break; + case SCHED_NODE_TYPE_VPORT_TC: + esw_warn(node->esw->dev, + "E-Switch %s %s scheduling element failed (vport=%d,tc=%d,err=%d)\n", + op, + sched_node_type_str[node->type], + node->vport->vport, node->tc, err); + break; + case SCHED_NODE_TYPE_VPORT: esw_warn(node->esw->dev, "E-Switch %s %s scheduling element failed (vport=%d,err=%d)\n", op, sched_node_type_str[node->type], node->vport->vport, err); - return; + break; + case SCHED_NODE_TYPE_RATE_LIMITER: + case SCHED_NODE_TYPE_TC_ARBITER_TSAR: + case SCHED_NODE_TYPE_VPORTS_TSAR: + esw_warn(node->esw->dev, + "E-Switch %s %s scheduling element failed (err=%d)\n", + op, sched_node_type_str[node->type], err); + break; + default: + esw_warn(node->esw->dev, + "E-Switch %s scheduling element failed (err=%d)\n", + op, err); + break; } - - esw_warn(node->esw->dev, - "E-Switch %s %s scheduling element failed (err=%d)\n", - op, sched_node_type_str[node->type], err); } static int esw_qos_node_create_sched_element(struct mlx5_esw_sched_node *node, void *ctx, @@ -233,6 +301,24 @@ static int esw_qos_sched_elem_config(struct mlx5_esw_sched_node *node, u32 max_r return 0; } +static int esw_qos_create_rate_limit_element(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + + if (!mlx5_qos_element_type_supported( + node->esw->dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT, + SCHEDULING_HIERARCHY_E_SWITCH)) + return -EOPNOTSUPP; + + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, node->max_rate); + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT); + + return esw_qos_node_create_sched_element(node, sched_ctx, extack); +} + static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent) { @@ -266,11 +352,13 @@ static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw, return 0; } -static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max) +static u32 esw_qos_calc_bw_share(u32 value, u32 divider, u32 fw_max) { if (!divider) return 0; - return min_t(u32, max_t(u32, DIV_ROUND_UP(min_rate, divider), MLX5_MIN_BW_SHARE), fw_max); + return min_t(u32, fw_max, + max_t(u32, + DIV_ROUND_UP(value, divider), MLX5_MIN_BW_SHARE)); } static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node, @@ -297,7 +385,13 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw, if (node->esw != esw || node->ix == esw->qos.root_tsar_ix) continue; - esw_qos_update_sched_node_bw_share(node, divider, extack); + /* Vports TC TSARs don't have a minimum rate configured, + * so there's no need to update the bw_share on them. + */ + if (node->type != SCHED_NODE_TYPE_VPORTS_TC_TSAR) { + esw_qos_update_sched_node_bw_share(node, divider, + extack); + } if (list_empty(&node->children)) continue; @@ -306,6 +400,20 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw, } } +static u32 esw_qos_calculate_tc_bw_divider(u32 *tc_bw) +{ + u32 total = 0; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) + total += tc_bw[i]; + + /* If total is zero, tc-bw config is disabled and we shouldn't reach + * here. + */ + return WARN_ON(!total) ? 1 : total; +} + static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node, u32 min_rate, struct netlink_ext_ack *extack) { @@ -350,28 +458,64 @@ esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_element_id, tsar_ix); } -static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, - struct netlink_ext_ack *extack) +static int +esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, + struct netlink_ext_ack *extack) { u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; struct mlx5_core_dev *dev = vport_node->esw->dev; void *attr; - if (!mlx5_qos_element_type_supported(dev, - SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT, - SCHEDULING_HIERARCHY_E_SWITCH)) + if (!mlx5_qos_element_type_supported( + dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT, + SCHEDULING_HIERARCHY_E_SWITCH)) return -EOPNOTSUPP; MLX5_SET(scheduling_context, sched_ctx, element_type, SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT); attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport); - MLX5_SET(scheduling_context, sched_ctx, parent_element_id, vport_node->parent->ix); - MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate); + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, + vport_node->parent->ix); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, + vport_node->max_rate); return esw_qos_node_create_sched_element(vport_node, sched_ctx, extack); } +static int +esw_qos_vport_tc_create_sched_element(struct mlx5_esw_sched_node *vport_tc_node, + u32 rate_limit_elem_ix, + struct netlink_ext_ack *extack) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_core_dev *dev = vport_tc_node->esw->dev; + void *attr; + + if (!mlx5_qos_element_type_supported( + dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC, + SCHEDULING_HIERARCHY_E_SWITCH)) + return -EOPNOTSUPP; + + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC); + attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); + MLX5_SET(vport_tc_element, attr, vport_number, + vport_tc_node->vport->vport); + MLX5_SET(vport_tc_element, attr, traffic_class, vport_tc_node->tc); + MLX5_SET(scheduling_context, sched_ctx, max_bw_obj_id, + rate_limit_elem_ix); + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, + vport_tc_node->parent->ix); + MLX5_SET(scheduling_context, sched_ctx, bw_share, + vport_tc_node->bw_share); + + return esw_qos_node_create_sched_element(vport_tc_node, sched_ctx, + extack); +} + static struct mlx5_esw_sched_node * __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type type, struct mlx5_esw_sched_node *parent) @@ -388,6 +532,14 @@ __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type node->parent = parent; INIT_LIST_HEAD(&node->children); esw_qos_node_attach_to_parent(node); + if (!parent) { + /* The caller is responsible for inserting the node into the + * parent list if necessary. This function can also be used with + * a NULL parent, which doesn't necessarily indicate that it + * refers to the root scheduling element. + */ + list_del_init(&node->entry); + } return node; } @@ -404,6 +556,149 @@ static void esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlin __esw_qos_free_node(node); } +static int esw_qos_create_vports_tc_node(struct mlx5_esw_sched_node *parent, + u8 tc, struct netlink_ext_ack *extack) +{ + u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_core_dev *dev = parent->esw->dev; + struct mlx5_esw_sched_node *vports_tc_node; + void *attr; + int err; + + if (!mlx5_qos_element_type_supported( + dev, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR, + SCHEDULING_HIERARCHY_E_SWITCH) || + !mlx5_qos_tsar_type_supported(dev, + TSAR_ELEMENT_TSAR_TYPE_DWRR, + SCHEDULING_HIERARCHY_E_SWITCH)) + return -EOPNOTSUPP; + + vports_tc_node = __esw_qos_alloc_node(parent->esw, 0, + SCHED_NODE_TYPE_VPORTS_TC_TSAR, + parent); + if (!vports_tc_node) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch alloc node failed"); + esw_warn(dev, "Failed to alloc vports TC node (tc=%d)\n", tc); + return -ENOMEM; + } + + attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes); + MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR); + MLX5_SET(tsar_element, attr, traffic_class, tc); + MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, parent->ix); + MLX5_SET(scheduling_context, tsar_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + + err = esw_qos_node_create_sched_element(vports_tc_node, tsar_ctx, + extack); + if (err) + goto err_create_sched_element; + + vports_tc_node->tc = tc; + + return 0; + +err_create_sched_element: + __esw_qos_free_node(vports_tc_node); + return err; +} + +static void +esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, + u32 *tc_bw) +{ + struct mlx5_esw_sched_node *vports_tc_node; + + list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) + tc_bw[vports_tc_node->tc] = vports_tc_node->bw_share; +} + +static void +esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, + u32 *tc_bw, struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = tc_arbiter_node->esw; + struct mlx5_esw_sched_node *vports_tc_node; + u32 divider, fw_max_bw_share; + + fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + divider = esw_qos_calculate_tc_bw_divider(tc_bw); + list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) { + u8 tc = vports_tc_node->tc; + u32 bw_share; + + bw_share = tc_bw[tc] * fw_max_bw_share; + bw_share = esw_qos_calc_bw_share(bw_share, divider, + fw_max_bw_share); + esw_qos_sched_elem_config(vports_tc_node, 0, bw_share, extack); + } +} + +static void +esw_qos_destroy_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vports_tc_node, *tmp; + + list_for_each_entry_safe(vports_tc_node, tmp, + &tc_arbiter_node->children, entry) + esw_qos_destroy_node(vports_tc_node, extack); +} + +static int +esw_qos_create_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = tc_arbiter_node->esw; + int err, i, num_tcs = esw_qos_num_tcs(esw->dev); + + for (i = 0; i < num_tcs; i++) { + err = esw_qos_create_vports_tc_node(tc_arbiter_node, i, extack); + if (err) + goto err_tc_node_create; + } + + return 0; + +err_tc_node_create: + esw_qos_destroy_vports_tc_nodes(tc_arbiter_node, NULL); + return err; +} + +static int esw_qos_create_tc_arbiter_sched_elem( + struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + u32 tsar_parent_ix; + void *attr; + + if (!mlx5_qos_tsar_type_supported(tc_arbiter_node->esw->dev, + TSAR_ELEMENT_TSAR_TYPE_TC_ARB, + SCHEDULING_HIERARCHY_E_SWITCH)) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch TC Arbiter scheduling element is not supported"); + return -EOPNOTSUPP; + } + + attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes); + MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_TC_ARB); + tsar_parent_ix = tc_arbiter_node->parent ? tc_arbiter_node->parent->ix : + tc_arbiter_node->esw->qos.root_tsar_ix; + MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, + tsar_parent_ix); + MLX5_SET(scheduling_context, tsar_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + MLX5_SET(scheduling_context, tsar_ctx, max_average_bw, + tc_arbiter_node->max_rate); + MLX5_SET(scheduling_context, tsar_ctx, bw_share, + tc_arbiter_node->bw_share); + + return esw_qos_node_create_sched_element(tc_arbiter_node, tsar_ctx, + extack); +} + static struct mlx5_esw_sched_node * __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) @@ -426,6 +721,7 @@ __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sch goto err_alloc_node; } + list_add_tail(&node->entry, &esw->qos.domain->nodes); esw_qos_normalize_min_rate(esw, NULL, extack); trace_mlx5_esw_node_qos_create(esw->dev, node, node->ix); @@ -467,6 +763,9 @@ static void __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netl { struct mlx5_eswitch *esw = node->esw; + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + esw_qos_destroy_vports_tc_nodes(node, extack); + trace_mlx5_esw_node_qos_destroy(esw->dev, node, node->ix); esw_qos_destroy_node(node, extack); esw_qos_normalize_min_rate(esw, NULL, extack); @@ -498,6 +797,9 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta SCHED_NODE_TYPE_VPORTS_TSAR, NULL)) esw->qos.node0 = ERR_PTR(-ENOMEM); + else + list_add_tail(&esw->qos.node0->entry, + &esw->qos.domain->nodes); } if (IS_ERR(esw->qos.node0)) { err = PTR_ERR(esw->qos.node0); @@ -555,12 +857,239 @@ static void esw_qos_put(struct mlx5_eswitch *esw) esw_qos_destroy(esw); } +static void +esw_qos_tc_arbiter_scheduling_teardown(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + /* Clean up all Vports TC nodes within the TC arbiter node. */ + esw_qos_destroy_vports_tc_nodes(node, extack); + /* Destroy the scheduling element for the TC arbiter node itself. */ + esw_qos_node_destroy_sched_element(node, extack); +} + +static int esw_qos_tc_arbiter_scheduling_setup(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + u32 curr_ix = node->ix; + int err; + + err = esw_qos_create_tc_arbiter_sched_elem(node, extack); + if (err) + return err; + /* Initialize the vports TC nodes within created TC arbiter TSAR. */ + err = esw_qos_create_vports_tc_nodes(node, extack); + if (err) + goto err_vports_tc_nodes; + + node->type = SCHED_NODE_TYPE_TC_ARBITER_TSAR; + + return 0; + +err_vports_tc_nodes: + /* If initialization fails, clean up the scheduling element + * for the TC arbiter node. + */ + esw_qos_node_destroy_sched_element(node, NULL); + node->ix = curr_ix; + return err; +} + +static int +esw_qos_create_vport_tc_sched_node(struct mlx5_vport *vport, + u32 rate_limit_elem_ix, + struct mlx5_esw_sched_node *vports_tc_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *vport_tc_node; + u8 tc = vports_tc_node->tc; + int err; + + vport_tc_node = __esw_qos_alloc_node(vport_node->esw, 0, + SCHED_NODE_TYPE_VPORT_TC, + vports_tc_node); + if (!vport_tc_node) + return -ENOMEM; + + vport_tc_node->min_rate = vport_node->min_rate; + vport_tc_node->tc = tc; + vport_tc_node->vport = vport; + err = esw_qos_vport_tc_create_sched_element(vport_tc_node, + rate_limit_elem_ix, + extack); + if (err) + goto err_out; + + vport->qos.sched_nodes[tc] = vport_tc_node; + + return 0; +err_out: + __esw_qos_free_node(vport_tc_node); + return err; +} + +static void +esw_qos_destroy_vport_tc_sched_elements(struct mlx5_vport *vport, + struct netlink_ext_ack *extack) +{ + int i, num_tcs = esw_qos_num_tcs(vport->qos.sched_node->esw->dev); + + for (i = 0; i < num_tcs; i++) { + if (vport->qos.sched_nodes[i]) { + __esw_qos_destroy_node(vport->qos.sched_nodes[i], + extack); + } + } + + kfree(vport->qos.sched_nodes); + vport->qos.sched_nodes = NULL; +} + +static int +esw_qos_create_vport_tc_sched_elements(struct mlx5_vport *vport, + enum sched_node_type type, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *tc_arbiter_node, *vports_tc_node; + int err, num_tcs = esw_qos_num_tcs(vport_node->esw->dev); + u32 rate_limit_elem_ix; + + vport->qos.sched_nodes = kcalloc(num_tcs, + sizeof(struct mlx5_esw_sched_node *), + GFP_KERNEL); + if (!vport->qos.sched_nodes) { + NL_SET_ERR_MSG_MOD(extack, + "Allocating the vport TC scheduling elements failed."); + return -ENOMEM; + } + + rate_limit_elem_ix = type == SCHED_NODE_TYPE_RATE_LIMITER ? + vport_node->ix : 0; + tc_arbiter_node = type == SCHED_NODE_TYPE_RATE_LIMITER ? + vport_node->parent : vport_node; + list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) { + err = esw_qos_create_vport_tc_sched_node(vport, + rate_limit_elem_ix, + vports_tc_node, + extack); + if (err) + goto err_create_vport_tc; + } + + return 0; + +err_create_vport_tc: + esw_qos_destroy_vport_tc_sched_elements(vport, NULL); + + return err; +} + +static int +esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + int err, new_level, max_level; + + if (type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + /* Increase the parent's level by 2 to account for both the + * TC arbiter and the vports TC scheduling element. + */ + new_level = vport_node->parent->level + 2; + max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev, + log_esw_max_sched_depth); + if (new_level > max_level) { + NL_SET_ERR_MSG_MOD(extack, + "TC arbitration on leafs is not supported beyond max scheduling depth"); + return -EOPNOTSUPP; + } + } + + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + + if (type == SCHED_NODE_TYPE_RATE_LIMITER) + err = esw_qos_create_rate_limit_element(vport_node, extack); + else + err = esw_qos_tc_arbiter_scheduling_setup(vport_node, extack); + if (err) + return err; + + /* Rate limiters impact multiple nodes not directly connected to them + * and are not direct members of the QoS hierarchy. + * Unlink it from the parent to reflect that. + */ + if (type == SCHED_NODE_TYPE_RATE_LIMITER) { + list_del_init(&vport_node->entry); + vport_node->level = 0; + } + + err = esw_qos_create_vport_tc_sched_elements(vport, type, extack); + if (err) + goto err_sched_nodes; + + return 0; + +err_sched_nodes: + if (type == SCHED_NODE_TYPE_RATE_LIMITER) { + esw_qos_node_destroy_sched_element(vport_node, NULL); + list_add_tail(&vport_node->entry, + &vport_node->parent->children); + vport_node->level = vport_node->parent->level + 1; + } else { + esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL); + } + return err; +} + +static void esw_qos_vport_tc_disable(struct mlx5_vport *vport, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + enum sched_node_type curr_type = vport_node->type; + + esw_qos_destroy_vport_tc_sched_elements(vport, extack); + + if (curr_type == SCHED_NODE_TYPE_RATE_LIMITER) + esw_qos_node_destroy_sched_element(vport_node, extack); + else + esw_qos_tc_arbiter_scheduling_teardown(vport_node, extack); +} + +static int esw_qos_set_vport_tcs_min_rate(struct mlx5_vport *vport, + u32 min_rate, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + int err, i, num_tcs = esw_qos_num_tcs(vport_node->esw->dev); + + for (i = 0; i < num_tcs; i++) { + err = esw_qos_set_node_min_rate(vport->qos.sched_nodes[i], + min_rate, extack); + if (err) + goto err_out; + } + vport_node->min_rate = min_rate; + + return 0; +err_out: + for (--i; i >= 0; i--) { + esw_qos_set_node_min_rate(vport->qos.sched_nodes[i], + vport_node->min_rate, extack); + } + return err; +} + static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; struct mlx5_esw_sched_node *parent = vport_node->parent; + enum sched_node_type curr_type = vport_node->type; - esw_qos_node_destroy_sched_element(vport_node, extack); + if (curr_type == SCHED_NODE_TYPE_VPORT) + esw_qos_node_destroy_sched_element(vport_node, extack); + else + esw_qos_vport_tc_disable(vport, extack); vport_node->bw_share = 0; list_del_init(&vport_node->entry); @@ -569,7 +1098,9 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport); } -static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, +static int esw_qos_vport_enable(struct mlx5_vport *vport, + enum sched_node_type type, + struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { int err; @@ -577,10 +1108,16 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_ esw_assert_qos_lock_held(vport->dev->priv.eswitch); esw_qos_node_set_parent(vport->qos.sched_node, parent); - err = esw_qos_vport_create_sched_element(vport->qos.sched_node, extack); + if (type == SCHED_NODE_TYPE_VPORT) { + err = esw_qos_vport_create_sched_element(vport->qos.sched_node, + extack); + } else { + err = esw_qos_vport_tc_enable(vport, type, extack); + } if (err) return err; + vport->qos.sched_node->type = type; esw_qos_normalize_min_rate(parent->esw, parent, extack); trace_mlx5_esw_vport_qos_create(vport->dev, vport, vport->qos.sched_node->max_rate, @@ -611,9 +1148,8 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t sched_node->min_rate = min_rate; sched_node->vport = vport; vport->qos.sched_node = sched_node; - err = esw_qos_vport_enable(vport, parent, extack); + err = esw_qos_vport_enable(vport, type, parent, extack); if (err) { - __esw_qos_free_node(sched_node); esw_qos_put(esw); vport->qos.sched_node = NULL; } @@ -666,6 +1202,8 @@ static int mlx5_esw_qos_set_vport_min_rate(struct mlx5_vport *vport, u32 min_rat if (!vport_node) return mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, NULL, 0, min_rate, extack); + else if (vport_node->type == SCHED_NODE_TYPE_RATE_LIMITER) + return esw_qos_set_vport_tcs_min_rate(vport, min_rate, extack); else return esw_qos_set_node_min_rate(vport_node, min_rate, extack); } @@ -698,12 +1236,73 @@ bool mlx5_esw_qos_get_vport_rate(struct mlx5_vport *vport, u32 *max_rate, u32 *m return enabled; } +static int esw_qos_vport_tc_check_type(enum sched_node_type curr_type, + enum sched_node_type new_type, + struct netlink_ext_ack *extack) +{ + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && + new_type == SCHED_NODE_TYPE_RATE_LIMITER) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot switch from vport-level TC arbitration to node-level TC arbitration"); + return -EOPNOTSUPP; + } + + if (curr_type == SCHED_NODE_TYPE_RATE_LIMITER && + new_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot switch from node-level TC arbitration to vport-level TC arbitration"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int esw_qos_vport_update(struct mlx5_vport *vport, + enum sched_node_type type, + struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent; + enum sched_node_type curr_type = vport->qos.sched_node->type; + u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0}; + int err; + + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + parent = parent ?: curr_parent; + if (curr_type == type && curr_parent == parent) + return 0; + + err = esw_qos_vport_tc_check_type(curr_type, type, extack); + if (err) + return err; + + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { + esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node, + curr_tc_bw); + } + + esw_qos_vport_disable(vport, extack); + + err = esw_qos_vport_enable(vport, type, parent, extack); + if (err) { + esw_qos_vport_enable(vport, curr_type, curr_parent, NULL); + extack = NULL; + } + + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { + esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node, + curr_tc_bw, extack); + } + + return err; +} + static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; struct mlx5_esw_sched_node *curr_parent; - int err; + enum sched_node_type type; esw_assert_qos_lock_held(esw); curr_parent = vport->qos.sched_node->parent; @@ -711,15 +1310,205 @@ static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw if (curr_parent == parent) return 0; - esw_qos_vport_disable(vport, extack); + /* Set vport QoS type based on parent node type if different from + * default QoS; otherwise, use the vport's current QoS type. + */ + if (parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + type = SCHED_NODE_TYPE_RATE_LIMITER; + else if (curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + type = SCHED_NODE_TYPE_VPORT; + else + type = vport->qos.sched_node->type; + + return esw_qos_vport_update(vport, type, parent, extack); +} - err = esw_qos_vport_enable(vport, parent, extack); +static void +esw_qos_switch_vport_tcs_to_vport(struct mlx5_esw_sched_node *tc_arbiter_node, + struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vports_tc_node, *vport_tc_node, *tmp; + + vports_tc_node = list_first_entry(&tc_arbiter_node->children, + struct mlx5_esw_sched_node, + entry); + + list_for_each_entry_safe(vport_tc_node, tmp, &vports_tc_node->children, + entry) + esw_qos_vport_update_parent(vport_tc_node->vport, node, extack); +} + +static int esw_qos_switch_tc_arbiter_node_to_vports( + struct mlx5_esw_sched_node *tc_arbiter_node, + struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + u32 parent_tsar_ix = node->parent ? + node->parent->ix : node->esw->qos.root_tsar_ix; + int err; + + err = esw_qos_create_node_sched_elem(node->esw->dev, parent_tsar_ix, + node->max_rate, node->bw_share, + &node->ix); if (err) { - if (esw_qos_vport_enable(vport, curr_parent, NULL)) - esw_warn(parent->esw->dev, "vport restore QoS failed (vport=%d)\n", - vport->vport); + NL_SET_ERR_MSG_MOD(extack, + "Failed to create scheduling element for vports node when disabliing vports TC QoS"); + return err; + } + + node->type = SCHED_NODE_TYPE_VPORTS_TSAR; + + /* Disable TC QoS for vports in the arbiter node. */ + esw_qos_switch_vport_tcs_to_vport(tc_arbiter_node, node, extack); + + return 0; +} + +static int esw_qos_switch_vports_node_to_tc_arbiter( + struct mlx5_esw_sched_node *node, + struct mlx5_esw_sched_node *tc_arbiter_node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node, *tmp; + struct mlx5_vport *vport; + int err; + + /* Enable TC QoS for each vport in the node. */ + list_for_each_entry_safe(vport_node, tmp, &node->children, entry) { + vport = vport_node->vport; + err = esw_qos_vport_update_parent(vport, tc_arbiter_node, + extack); + if (err) + goto err_out; + } + + /* Destroy the current vports node TSAR. */ + err = mlx5_destroy_scheduling_element_cmd(node->esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + node->ix); + if (err) + goto err_out; + + return 0; +err_out: + /* Restore vports back into the node if an error occurs. */ + esw_qos_switch_vport_tcs_to_vport(tc_arbiter_node, node, NULL); + + return err; +} + +static struct mlx5_esw_sched_node * +esw_qos_move_node(struct mlx5_esw_sched_node *curr_node) +{ + struct mlx5_esw_sched_node *new_node; + + new_node = __esw_qos_alloc_node(curr_node->esw, curr_node->ix, + curr_node->type, NULL); + if (!IS_ERR(new_node)) + esw_qos_nodes_set_parent(&curr_node->children, new_node); + + return new_node; +} + +static int esw_qos_node_disable_tc_arbitration(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_node; + int err; + + if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR) + return 0; + + /* Allocate a new rate node to hold the current state, which will allow + * for restoring the vports back to this node after disabling TC + * arbitration. + */ + curr_node = esw_qos_move_node(node); + if (IS_ERR(curr_node)) { + NL_SET_ERR_MSG_MOD(extack, "Failed setting up vports node"); + return PTR_ERR(curr_node); + } + + /* Disable TC QoS for all vports, and assign them back to the node. */ + err = esw_qos_switch_tc_arbiter_node_to_vports(curr_node, node, extack); + if (err) + goto err_out; + + /* Clean up the TC arbiter node after disabling TC QoS for vports. */ + esw_qos_tc_arbiter_scheduling_teardown(curr_node, extack); + goto out; +err_out: + esw_qos_nodes_set_parent(&curr_node->children, node); +out: + __esw_qos_free_node(curr_node); + return err; +} + +static int esw_qos_node_enable_tc_arbitration(struct mlx5_esw_sched_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_node, *child; + int err, new_level, max_level; + + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + return 0; + + /* Increase the hierarchy level by one to account for the additional + * vports TC scheduling node, and verify that the new level does not + * exceed the maximum allowed depth. + */ + new_level = node->level + 1; + max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth); + if (new_level > max_level) { + NL_SET_ERR_MSG_MOD(extack, + "TC arbitration on nodes is not supported beyond max scheduling depth"); + return -EOPNOTSUPP; + } + + /* Ensure the node does not contain non-leaf children before assigning + * TC bandwidth. + */ + if (!list_empty(&node->children)) { + list_for_each_entry(child, &node->children, entry) { + if (!child->vport) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot configure TC bandwidth on a node with non-leaf children"); + return -EOPNOTSUPP; + } + } } + /* Allocate a new node that will store the information of the current + * node. This will be used later to restore the node if necessary. + */ + curr_node = esw_qos_move_node(node); + if (IS_ERR(curr_node)) { + NL_SET_ERR_MSG_MOD(extack, "Failed setting up node TC QoS"); + return PTR_ERR(curr_node); + } + + /* Initialize the TC arbiter node for QoS management. + * This step prepares the node for handling Traffic Class arbitration. + */ + err = esw_qos_tc_arbiter_scheduling_setup(node, extack); + if (err) + goto err_setup; + + /* Enable TC QoS for each vport within the current node. */ + err = esw_qos_switch_vports_node_to_tc_arbiter(curr_node, node, extack); + if (err) + goto err_switch_vports; + goto out; + +err_switch_vports: + esw_qos_tc_arbiter_scheduling_teardown(node, NULL); + node->ix = curr_node->ix; + node->type = curr_node->type; +err_setup: + esw_qos_nodes_set_parent(&curr_node->children, node); +out: + __esw_qos_free_node(curr_node); return err; } @@ -848,6 +1637,41 @@ static int esw_qos_devlink_rate_to_mbps(struct mlx5_core_dev *mdev, const char * return 0; } +static bool esw_qos_validate_unsupported_tc_bw(struct mlx5_eswitch *esw, + u32 *tc_bw) +{ + int i, num_tcs = esw_qos_num_tcs(esw->dev); + + for (i = num_tcs; i < DEVLINK_RATE_TCS_MAX; i++) { + if (tc_bw[i]) + return false; + } + + return true; +} + +static bool esw_qos_vport_validate_unsupported_tc_bw(struct mlx5_vport *vport, + u32 *tc_bw) +{ + struct mlx5_eswitch *esw = vport->qos.sched_node ? + vport->qos.sched_node->parent->esw : + vport->dev->priv.eswitch; + + return esw_qos_validate_unsupported_tc_bw(esw, tc_bw); +} + +static bool esw_qos_tc_bw_disabled(u32 *tc_bw) +{ + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + if (tc_bw[i]) + return false; + } + + return true; +} + int mlx5_esw_qos_init(struct mlx5_eswitch *esw) { if (esw->qos.domain) @@ -906,6 +1730,90 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void * return err; } +int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *vport_node; + struct mlx5_vport *vport = priv; + struct mlx5_eswitch *esw; + bool disable; + int err = 0; + + esw = vport->dev->priv.eswitch; + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + disable = esw_qos_tc_bw_disabled(tc_bw); + esw_qos_lock(esw); + + if (!esw_qos_vport_validate_unsupported_tc_bw(vport, tc_bw)) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch traffic classes number is not supported"); + err = -EOPNOTSUPP; + goto unlock; + } + + vport_node = vport->qos.sched_node; + if (disable && !vport_node) + goto unlock; + + if (disable) { + if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, + NULL, extack); + goto unlock; + } + + if (!vport_node) { + err = mlx5_esw_qos_vport_enable(vport, + SCHED_NODE_TYPE_TC_ARBITER_TSAR, + NULL, 0, 0, extack); + vport_node = vport->qos.sched_node; + } else { + err = esw_qos_vport_update(vport, + SCHED_NODE_TYPE_TC_ARBITER_TSAR, + NULL, extack); + } + if (!err) + esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack); +unlock: + esw_qos_unlock(esw); + return err; +} + +int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *node = priv; + struct mlx5_eswitch *esw = node->esw; + bool disable; + int err; + + if (!esw_qos_validate_unsupported_tc_bw(esw, tc_bw)) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch traffic classes number is not supported"); + return -EOPNOTSUPP; + } + + disable = esw_qos_tc_bw_disabled(tc_bw); + esw_qos_lock(esw); + if (disable) { + err = esw_qos_node_disable_tc_arbitration(node, extack); + goto unlock; + } + + err = esw_qos_node_enable_tc_arbitration(node, extack); + if (!err) + esw_qos_set_tc_arbiter_bw_shares(node, tc_bw, extack); +unlock: + esw_qos_unlock(esw); + return err; +} + int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv, u64 tx_share, struct netlink_ext_ack *extack) { @@ -996,10 +1904,16 @@ int mlx5_esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_s } esw_qos_lock(esw); - if (!vport->qos.sched_node && parent) - err = mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, parent, 0, 0, extack); - else if (vport->qos.sched_node) + if (!vport->qos.sched_node && parent) { + enum sched_node_type type; + + type = parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR ? + SCHED_NODE_TYPE_RATE_LIMITER : SCHED_NODE_TYPE_VPORT; + err = mlx5_esw_qos_vport_enable(vport, type, parent, 0, 0, + extack); + } else if (vport->qos.sched_node) { err = esw_qos_vport_update_parent(vport, parent, extack); + } esw_qos_unlock(esw); return err; } @@ -1019,6 +1933,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, return mlx5_esw_qos_vport_update_parent(vport, node, extack); } +static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node) +{ + if (list_empty(&node->children)) + return true; + + if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR) + return false; + + node = list_first_entry(&node->children, struct mlx5_esw_sched_node, + entry); + + return esw_qos_is_node_empty(node); +} + static int mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent, @@ -1032,13 +1960,26 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node, return -EOPNOTSUPP; } - if (!list_empty(&node->children)) { + if (!esw_qos_is_node_empty(node)) { NL_SET_ERR_MSG_MOD(extack, "Cannot reassign a node that contains rate objects"); return -EOPNOTSUPP; } + if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot attach a node to a parent with TC bandwidth configured"); + return -EOPNOTSUPP; + } + new_level = parent ? parent->level + 1 : 2; + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + /* Increase by one to account for the vports TC scheduling + * element. + */ + new_level += 1; + } + max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth); if (new_level > max_level) { NL_SET_ERR_MSG_MOD(extack, @@ -1049,6 +1990,32 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node, return 0; } +static int +esw_qos_tc_arbiter_node_update_parent(struct mlx5_esw_sched_node *node, + struct mlx5_esw_sched_node *parent, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_sched_node *curr_parent = node->parent; + u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0}; + struct mlx5_eswitch *esw = node->esw; + int err; + + esw_qos_tc_arbiter_get_bw_shares(node, curr_tc_bw); + esw_qos_tc_arbiter_scheduling_teardown(node, extack); + esw_qos_node_set_parent(node, parent); + err = esw_qos_tc_arbiter_scheduling_setup(node, extack); + if (err) { + esw_qos_node_set_parent(node, curr_parent); + if (esw_qos_tc_arbiter_scheduling_setup(node, extack)) { + esw_warn(esw->dev, "Node restore QoS failed\n"); + return err; + } + } + esw_qos_set_tc_arbiter_bw_shares(node, curr_tc_bw, extack); + + return err; +} + static int esw_qos_vports_node_update_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) @@ -1094,7 +2061,13 @@ static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node, esw_qos_lock(esw); curr_parent = node->parent; - err = esw_qos_vports_node_update_parent(node, parent, extack); + if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + err = esw_qos_tc_arbiter_node_update_parent(node, parent, + extack); + } else { + err = esw_qos_vports_node_update_parent(node, parent, extack); + } + if (err) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h index ed40ec8f027e..0a50982b0e27 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h @@ -21,6 +21,14 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void u64 tx_share, struct netlink_ext_ack *extack); int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv, u64 tx_max, struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_node, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node, + void *priv, + u32 *tc_bw, + struct netlink_ext_ack *extack); int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 8573d36785f4..d59fdcb29cb8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -212,10 +212,20 @@ struct mlx5_vport { struct mlx5_vport_info info; - /* Protected with the E-Switch qos domain lock. */ + /* Protected with the E-Switch qos domain lock. The Vport QoS can + * either be disabled (sched_node is NULL) or in one of three states: + * 1. Regular QoS (sched_node is a vport node). + * 2. TC QoS enabled on the vport (sched_node is a TC arbiter). + * 3. TC QoS enabled on the vport's parent node + * (sched_node is a rate limit node). + * When TC is enabled in either mode, the vport owns vport TC scheduling + * nodes. + */ struct { - /* Vport scheduling element node. */ + /* Vport scheduling node. */ struct mlx5_esw_sched_node *sched_node; + /* Array of vport traffic class scheduling nodes. */ + struct mlx5_esw_sched_node **sched_nodes; } qos; u16 vport; diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 3e0b61202f0c..b3647691060c 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -388,6 +388,17 @@ static const struct file_operations nsim_dev_rate_parent_fops = { .owner = THIS_MODULE, }; +static void nsim_dev_tc_bw_debugfs_init(struct dentry *ddir, u32 *tc_bw) +{ + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + char name[16]; + + snprintf(name, sizeof(name), "tc%d_bw", i); + debugfs_create_u32(name, 0400, ddir, &tc_bw[i]); + } +} static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) { @@ -415,6 +426,8 @@ static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev, nsim_dev_port->ddir, &nsim_dev_port->parent_name, &nsim_dev_rate_parent_fops); + nsim_dev_tc_bw_debugfs_init(nsim_dev_port->ddir, + nsim_dev_port->tc_bw); } debugfs_create_symlink("dev", nsim_dev_port->ddir, dev_link_name); @@ -1172,6 +1185,19 @@ static int nsim_rate_bytes_to_units(char *name, u64 *rate, struct netlink_ext_ac return 0; } +static int nsim_leaf_tc_bw_set(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack) +{ + struct nsim_dev_port *nsim_dev_port = priv; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) + nsim_dev_port->tc_bw[i] = tc_bw[i]; + + return 0; +} + static int nsim_leaf_tx_share_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack) { @@ -1210,8 +1236,21 @@ struct nsim_rate_node { char *parent_name; u16 tx_share; u16 tx_max; + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; +static int nsim_node_tc_bw_set(struct devlink_rate *devlink_rate, void *priv, + u32 *tc_bw, struct netlink_ext_ack *extack) +{ + struct nsim_rate_node *nsim_node = priv; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) + nsim_node->tc_bw[i] = tc_bw[i]; + + return 0; +} + static int nsim_node_tx_share_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack) { @@ -1264,6 +1303,8 @@ static int nsim_rate_node_new(struct devlink_rate *node, void **priv, &nsim_node->parent_name, &nsim_dev_rate_parent_fops); + nsim_dev_tc_bw_debugfs_init(nsim_node->ddir, nsim_node->tc_bw); + *priv = nsim_node; return 0; } @@ -1340,8 +1381,10 @@ static const struct devlink_ops nsim_dev_devlink_ops = { .trap_policer_counter_get = nsim_dev_devlink_trap_policer_counter_get, .rate_leaf_tx_share_set = nsim_leaf_tx_share_set, .rate_leaf_tx_max_set = nsim_leaf_tx_max_set, + .rate_leaf_tc_bw_set = nsim_leaf_tc_bw_set, .rate_node_tx_share_set = nsim_node_tx_share_set, .rate_node_tx_max_set = nsim_node_tx_max_set, + .rate_node_tc_bw_set = nsim_node_tc_bw_set, .rate_node_new = nsim_rate_node_new, .rate_node_del = nsim_rate_node_del, .rate_leaf_parent_set = nsim_rate_leaf_parent_set, diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 4a0c48c7a384..809dd29fc5fe 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -276,6 +276,7 @@ struct nsim_dev_port { struct dentry *ddir; struct dentry *rate_parent; char *parent_name; + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; struct netdevsim *ns; }; diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 4ff56d9f8f28..adc89e651e27 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -971,15 +971,10 @@ static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh, if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return -EOPNOTSUPP; - nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) { - switch (nla_type(attr)) { - case VXLAN_VNIFILTER_ENTRY: - err = vxlan_process_vni_filter(vxlan, attr, - nlh->nlmsg_type, extack); - break; - default: - continue; - } + nlmsg_for_each_attr_type(attr, VXLAN_VNIFILTER_ENTRY, nlh, + sizeof(*tmsg), rem) { + err = vxlan_process_vni_filter(vxlan, attr, nlh->nlmsg_type, + extack); vnis++; if (err) break; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6a42cc7a845a..657d44afc062 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1621,10 +1621,9 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; /* count number of SERVER_THREADS values */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { - if (nla_type(attr) == NFSD_A_SERVER_THREADS) - nrpools++; - } + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) + nrpools++; mutex_lock(&nfsd_mutex); @@ -1635,12 +1634,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) } i = 0; - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { - if (nla_type(attr) == NFSD_A_SERVER_THREADS) { - nthreads[i++] = nla_get_u32(attr); - if (i >= nrpools) - break; - } + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) { + nthreads[i++] = nla_get_u32(attr); + if (i >= nrpools) + break; } if (info->attrs[NFSD_A_SERVER_GRACETIME] || @@ -1781,14 +1779,12 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info) for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) nfsd_minorversion(nn, i, NFSD_CLEAR); - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_VERSION_MAX + 1]; u32 major, minor = 0; bool enabled; - if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION) - continue; - if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr, nfsd_version_nl_policy, info->extack) < 0) continue; @@ -1939,14 +1935,12 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * Walk the list of server_socks from userland and move any that match * back to sv_permsocks */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; - if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) - continue; - if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; @@ -2001,15 +1995,13 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) svc_xprt_destroy_all(serv, net); /* walk list of addrs again, open any that still don't exist */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; int ret; - if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) - continue; - if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; diff --git a/include/net/devlink.h b/include/net/devlink.h index 63517646a497..d0ce5a7e984c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -118,6 +118,8 @@ struct devlink_rate { u32 tx_priority; u32 tx_weight; + + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; struct devlink_port { @@ -1486,6 +1488,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, @@ -1494,6 +1499,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/net/netlink.h b/include/net/netlink.h index 90a560dc167a..1a8356ca4b78 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -68,6 +68,8 @@ * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes + * nlmsg_for_each_attr_type() loop over all attributes with the + * given type * * Misc: * nlmsg_report() report back to application? @@ -967,6 +969,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) nlmsg_attrlen(nlh, hdrlen), rem) /** + * nlmsg_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to the current attribute + * @type: required attribute type for @pos + * @nlh: netlink message header + * @hdrlen: length of the family specific header + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \ + nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ + if (nla_type(pos) == type) + +/** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a5ee0f13740a..e72bcc239afd 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -221,6 +221,11 @@ enum devlink_port_flavour { */ }; +/* IEEE 802.1Qaz standard supported values. */ + +#define DEVLINK_RATE_TCS_MAX 8 +#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1) + enum devlink_rate_type { DEVLINK_RATE_TYPE_LEAF, DEVLINK_RATE_TYPE_NODE, @@ -629,6 +634,10 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ + DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ + DEVLINK_ATTR_RATE_TC_BW, /* u32 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index e340d955cf3b..c50436433c18 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -45,6 +45,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; +const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { + [DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), + [DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, }, +}; + const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, }; @@ -523,7 +528,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ }; /* DEVLINK_CMD_RATE_SET - do */ -static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -532,10 +537,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_NEW - do */ -static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -544,6 +550,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_DEL - do */ @@ -1191,7 +1198,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_set_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { @@ -1201,7 +1208,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_new_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_new_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 8f2bd50ddf5e..fb733b5d4ff1 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -13,6 +13,7 @@ /* Common nested types */ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; +extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1]; extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 8828ffaf6cbc..d39300a9b3d4 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) return ERR_PTR(-EINVAL); } +static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) +{ + struct nlattr *nla_tc_bw; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); + if (!nla_tc_bw) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || + nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i])) + goto nla_put_failure; + + nla_nest_end(msg, nla_tc_bw); + } + return 0; + +nla_put_failure: + nla_nest_cancel(msg, nla_tc_bw); + return -EMSGSIZE; +} + static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, @@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->parent->name)) goto nla_put_failure; + if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -316,6 +342,87 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, return 0; } +static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, + unsigned long *bitmap, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[DEVLINK_ATTR_MAX + 1]; + u8 tc_index; + int err; + + err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest, + devlink_dl_rate_tc_bws_nl_policy, extack); + if (err) + return err; + + if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + DEVLINK_ATTR_RATE_TC_INDEX); + return -EINVAL; + } + + tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); + + if (!tb[DEVLINK_ATTR_RATE_TC_BW]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + DEVLINK_ATTR_RATE_TC_BW); + return -EINVAL; + } + + if (test_and_set_bit(tc_index, bitmap)) { + NL_SET_ERR_MSG_FMT(extack, + "Duplicate traffic class index specified (%u)", + tc_index); + return -EINVAL; + } + + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]); + + return 0; +} + +static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, + struct genl_info *info) +{ + DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; + struct devlink *devlink = devlink_rate->devlink; + const struct devlink_ops *ops = devlink->ops; + u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; + int rem, err = -EOPNOTSUPP, i; + struct nlattr *attr; + + nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, + GENL_HDRLEN, rem) { + err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, + info->extack); + if (err) + return err; + } + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + if (!test_bit(i, bitmap)) { + NL_SET_ERR_MSG_FMT(info->extack, + "Bandwidth values must be specified for all %u traffic classes", + DEVLINK_RATE_TCS_MAX); + return -EINVAL; + } + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + + if (err) + return err; + + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); + + return 0; +} + static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) @@ -388,6 +495,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, return err; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { + err = devlink_nl_rate_tc_bw_set(devlink_rate, info); + if (err) + return err; + } + return 0; } @@ -423,6 +536,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_leaf_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); @@ -449,6 +569,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_node_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; diff --git a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py new file mode 100755 index 000000000000..820d8a03becc --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Devlink Rate TC Bandwidth Test Suite +=================================== + +This test suite verifies the functionality of devlink-rate traffic class (TC) +bandwidth distribution in a virtualized environment. The tests validate that +bandwidth can be properly allocated between different traffic classes and +that TC mapping works as expected. + +Test Environment: +---------------- +- Creates 1 VF +- Establishes a bridge connecting the VF representor and the uplink representor +- Sets up 2 VLAN interfaces on the VF with different VLAN IDs (101, 102) +- Configures different traffic classes (TC3 and TC4) for each VLAN + +Test Cases: +---------- +1. test_no_tc_mapping_bandwidth: + - Verifies that without TC mapping, bandwidth is NOT distributed according to + the configured 80/20 split between TC4 and TC3 + - This test should fail if bandwidth matches the 80/20 split without TC + mapping + - Expected: Bandwidth should NOT be distributed as 80/20 + +2. test_tc_mapping_bandwidth: + - Configures TC mapping using mqprio qdisc + - Verifies that with TC mapping, bandwidth IS distributed according to the + configured 80/20 split between TC3 and TC4 + - Expected: Bandwidth should be distributed as 80/20 + +Bandwidth Distribution: +---------------------- +- TC3 (VLAN 101): Configured for 80% of total bandwidth +- TC4 (VLAN 102): Configured for 20% of total bandwidth +- Total bandwidth: 1Gbps +- Tolerance: +-12% + +Hardware-Specific Behavior (mlx5): +-------------------------- +mlx5 hardware enforces traffic class separation by ensuring that each transmit +queue (SQ) is associated with a single TC. If a packet is sent on a queue that +doesn't match the expected TC (based on DSCP or VLAN priority and hypervisor-set +mapping), the hardware moves the queue to the correct TC scheduler to preserve +traffic isolation. + +This behavior means that even without explicit TC-to-queue mapping, bandwidth +enforcement may still appear to work—because the hardware dynamically adjusts +the scheduling context. However, this can lead to performance issues in high +rates and HOL blocking if traffic from different TCs is mixed on the same queue. +""" + +import json +import os +import subprocess +import threading +import time + +from lib.py import ksft_pr, ksft_run, ksft_exit +from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx +from lib.py import NetDrvEpEnv, DevlinkFamily +from lib.py import NlError +from lib.py import cmd, defer, ethtool, ip + + +class BandwidthValidator: + """ + Validates bandwidth totals and per-TC shares against expected values + with a tolerance. + """ + + def __init__(self): + self.tolerance_percent = 12 + self.expected_total_gbps = 1.0 + self.total_min_expected = self.min_expected(self.expected_total_gbps) + self.total_max_expected = self.max_expected(self.expected_total_gbps) + self.tc_expected_percent = { + 3: 20.0, + 4: 80.0, + } + + def min_expected(self, value): + """Calculates the minimum acceptable value based on tolerance.""" + return value - (value * self.tolerance_percent / 100) + + def max_expected(self, value): + """Calculates the maximum acceptable value based on tolerance.""" + return value + (value * self.tolerance_percent / 100) + + def bound(self, expected, value): + """Returns True if value is within expected tolerance.""" + return self.min_expected(expected) <= value <= self.max_expected(expected) + + def tc_bandwidth_bound(self, value, tc_ix): + """ + Returns True if the given bandwidth value is within tolerance + for the TC's expected bandwidth. + """ + expected = self.tc_expected_percent[tc_ix] + return self.bound(expected, value) + + +def setup_vf(cfg, set_tc_mapping=True): + """ + Sets up a VF on the given network interface. + + Enables SR-IOV and switchdev mode, brings the VF interface up, + and optionally configures TC mapping using mqprio. + """ + try: + cmd(f"devlink dev eswitch set pci/{cfg.pci} mode switchdev") + defer(cmd, f"devlink dev eswitch set pci/{cfg.pci} mode legacy") + except Exception as exc: + raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc + try: + cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs") + defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs") + except Exception as exc: + raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc + + time.sleep(2) + vf_ifc = (os.listdir( + f"/sys/class/net/{cfg.ifname}/device/virtfn0/net") or [None])[0] + if vf_ifc: + ip(f"link set dev {vf_ifc} up") + else: + raise KsftSkipEx("VF interface not found") + if set_tc_mapping: + cmd(f"tc qdisc add dev {vf_ifc} root handle 5 mqprio mode dcb hw 1 num_tc 8") + + return vf_ifc + + +def setup_vlans_on_vf(vf_ifc): + """ + Sets up two VLAN interfaces on the given VF, each mapped to a different TC. + """ + vlan_configs = [ + {"vlan_id": 101, "tc": 3, "ip": "198.51.100.2"}, + {"vlan_id": 102, "tc": 4, "ip": "198.51.100.10"}, + ] + + for config in vlan_configs: + vlan_dev = f"{vf_ifc}.{config['vlan_id']}" + ip(f"link add link {vf_ifc} name {vlan_dev} type vlan id {config['vlan_id']}") + ip(f"addr add {config['ip']}/29 dev {vlan_dev}") + ip(f"link set dev {vlan_dev} up") + ip(f"link set dev {vlan_dev} type vlan egress-qos-map 0:{config['tc']}") + ksft_pr(f"Created VLAN {vlan_dev} on {vf_ifc} with tc {config['tc']} and IP {config['ip']}") + + +def get_vf_info(cfg): + """ + Finds the VF representor interface and devlink port index + for the given PCI device used in the test environment. + """ + cfg.vf_representor = None + cfg.vf_port_index = None + out = subprocess.check_output(["devlink", "-j", "port", "show"], encoding="utf-8") + ports = json.loads(out)["port"] + + for port_name, props in ports.items(): + netdev = props.get("netdev") + + if (port_name.startswith(f"pci/{cfg.pci}/") and + props.get("vfnum") == 0): + cfg.vf_representor = netdev + cfg.vf_port_index = int(port_name.split("/")[-1]) + break + + +def setup_bridge(cfg): + """ + Creates and configures a Linux bridge, with both the uplink + and VF representor interfaces attached to it. + """ + bridge_name = f"br_{os.getpid()}" + ip(f"link add name {bridge_name} type bridge") + defer(cmd, f"ip link del name {bridge_name} type bridge") + + ip(f"link set dev {cfg.ifname} master {bridge_name}") + + rep_name = cfg.vf_representor + if rep_name: + ip(f"link set dev {rep_name} master {bridge_name}") + ip(f"link set dev {rep_name} up") + ksft_pr(f"Set representor {rep_name} up and added to bridge") + else: + raise KsftSkipEx("Could not find representor for the VF") + + ip(f"link set dev {bridge_name} up") + + +def setup_devlink_rate(cfg): + """ + Configures devlink rate tx_max and traffic class bandwidth for the VF. + """ + port_index = cfg.vf_port_index + if port_index is None: + raise KsftSkipEx("Could not find VF port index") + try: + cfg.devnl.rate_set({ + "bus-name": "pci", + "dev-name": cfg.pci, + "port-index": port_index, + "rate-tx-max": 125000000, + "rate-tc-bws": [ + {"rate-tc-index": 0, "rate-tc-bw": 0}, + {"rate-tc-index": 1, "rate-tc-bw": 0}, + {"rate-tc-index": 2, "rate-tc-bw": 0}, + {"rate-tc-index": 3, "rate-tc-bw": 20}, + {"rate-tc-index": 4, "rate-tc-bw": 80}, + {"rate-tc-index": 5, "rate-tc-bw": 0}, + {"rate-tc-index": 6, "rate-tc-bw": 0}, + {"rate-tc-index": 7, "rate-tc-bw": 0}, + ] + }) + except NlError as exc: + if exc.error == 95: # EOPNOTSUPP + raise KsftSkipEx("devlink rate configuration is not supported on the VF") from exc + raise KsftFailEx(f"rate_set failed on VF port {port_index}") from exc + + +def setup_remote_server(cfg): + """ + Sets up VLAN interfaces and starts iperf3 servers on the remote side. + """ + remote_dev = cfg.remote_ifname + vlan_ids = [101, 102] + remote_ips = ["198.51.100.1", "198.51.100.9"] + + for vlan_id, ip_addr in zip(vlan_ids, remote_ips): + vlan_dev = f"{remote_dev}.{vlan_id}" + cmd(f"ip link add link {remote_dev} name {vlan_dev} " + f"type vlan id {vlan_id}", host=cfg.remote) + cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote) + cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote) + cmd(f"iperf3 -s -1 -B {ip_addr}",background=True, host=cfg.remote) + defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote) + + +def setup_test_environment(cfg, set_tc_mapping=True): + """ + Sets up the complete test environment including VF creation, VLANs, + bridge configuration, devlink rate setup, and the remote server. + """ + vf_ifc = setup_vf(cfg, set_tc_mapping) + ksft_pr(f"Created VF interface: {vf_ifc}") + + setup_vlans_on_vf(vf_ifc) + + get_vf_info(cfg) + setup_bridge(cfg) + + setup_devlink_rate(cfg) + setup_remote_server(cfg) + time.sleep(2) + + +def run_iperf_client(server_ip, local_ip, barrier, min_expected_gbps=0.1): + """ + Runs a single iperf3 client instance, binding to the given local IP. + Waits on a barrier to synchronize with other threads. + """ + try: + barrier.wait(timeout=10) + except Exception as exc: + raise KsftFailEx("iperf3 barrier wait timed") from exc + + iperf_cmd = ["iperf3", "-c", server_ip, "-B", local_ip, "-J"] + result = subprocess.run(iperf_cmd, capture_output=True, text=True, + check=True) + + try: + output = json.loads(result.stdout) + bits_per_second = output["end"]["sum_received"]["bits_per_second"] + gbps = bits_per_second / 1e9 + if gbps < min_expected_gbps: + ksft_pr( + f"iperf3 bandwidth too low: {gbps:.2f} Gbps " + f"(expected ≥ {min_expected_gbps} Gbps)" + ) + return None + return gbps + except json.JSONDecodeError as exc: + ksft_pr(f"Failed to parse iperf3 JSON output: {exc}") + return None + + +def run_bandwidth_test(): + """ + Launches iperf3 client threads for each VLAN/TC pair and collects results. + """ + def _run_iperf_client_thread(server_ip, local_ip, results, barrier, tc_ix): + results[tc_ix] = run_iperf_client(server_ip, local_ip, barrier) + + vf_vlan_data = [ + # (local_ip, remote_ip, TC) + ("198.51.100.2", "198.51.100.1", 3), + ("198.51.100.10", "198.51.100.9", 4), + ] + + results = {} + threads = [] + start_barrier = threading.Barrier(len(vf_vlan_data)) + + for local_ip, remote_ip, tc_ix in vf_vlan_data: + thread = threading.Thread( + target=_run_iperf_client_thread, + args=(remote_ip, local_ip, results, start_barrier, tc_ix) + ) + thread.start() + threads.append(thread) + + for thread in threads: + thread.join() + + for tc_ix, tc_bw in results.items(): + if tc_bw is None: + raise KsftFailEx("iperf3 client failed; cannot evaluate bandwidth") + + return results + +def calculate_bandwidth_percentages(results): + """ + Calculates the percentage of total bandwidth received by TC3 and TC4. + """ + if 3 not in results or 4 not in results: + raise KsftFailEx(f"Missing expected TC results in {results}") + + tc3_bw = results[3] + tc4_bw = results[4] + total_bw = tc3_bw + tc4_bw + tc3_percentage = (tc3_bw / total_bw) * 100 + tc4_percentage = (tc4_bw / total_bw) * 100 + + return { + 'tc3_bw': tc3_bw, + 'tc4_bw': tc4_bw, + 'tc3_percentage': tc3_percentage, + 'tc4_percentage': tc4_percentage, + 'total_bw': total_bw + } + + +def print_bandwidth_results(bw_data, test_name): + """ + Prints bandwidth measurements and TC usage summary for a given test. + """ + ksft_pr(f"Bandwidth check results {test_name}:") + ksft_pr(f"TC 3: {bw_data['tc3_bw']:.2f} Gbits/sec") + ksft_pr(f"TC 4: {bw_data['tc4_bw']:.2f} Gbits/sec") + ksft_pr(f"Total bandwidth: {bw_data['total_bw']:.2f} Gbits/sec") + ksft_pr(f"TC 3 percentage: {bw_data['tc3_percentage']:.1f}%") + ksft_pr(f"TC 4 percentage: {bw_data['tc4_percentage']:.1f}%") + + +def verify_total_bandwidth(bw_data, validator): + """ + Ensures the total measured bandwidth falls within the acceptable tolerance. + """ + total = bw_data['total_bw'] + + if validator.bound(validator.expected_total_gbps, total): + return + + if total < validator.total_min_expected: + raise KsftSkipEx( + f"Total bandwidth {total:.2f} Gbps < minimum " + f"{validator.total_min_expected:.2f} Gbps; " + f"parent tx_max ({validator.expected_total_gbps:.1f} G) " + f"not reached, cannot validate share" + ) + + raise KsftFailEx( + f"Total bandwidth {total:.2f} Gbps exceeds allowed ceiling " + f"{validator.total_max_expected:.2f} Gbps " + f"(VF tx_max set to {validator.expected_total_gbps:.1f} G)" + ) + + +def check_bandwidth_distribution(bw_data, validator): + """ + Checks whether the measured TC3 and TC4 bandwidth percentages + fall within their expected tolerance ranges. + + Returns: + bool: True if both TC3 and TC4 percentages are within bounds. + """ + tc3_valid = validator.tc_bandwidth_bound(bw_data['tc3_percentage'], 3) + tc4_valid = validator.tc_bandwidth_bound(bw_data['tc4_percentage'], 4) + + return tc3_valid and tc4_valid + + +def run_bandwidth_distribution_test(cfg, set_tc_mapping): + """ + Runs parallel iperf3 tests for both TCs and collects results. + """ + setup_test_environment(cfg, set_tc_mapping) + bandwidths = run_bandwidth_test() + bw_data = calculate_bandwidth_percentages(bandwidths) + test_name = "with TC mapping" if set_tc_mapping else "without TC mapping" + print_bandwidth_results(bw_data, test_name) + + verify_total_bandwidth(bw_data, cfg.bw_validator) + + return check_bandwidth_distribution(bw_data, cfg.bw_validator) + + +def test_no_tc_mapping_bandwidth(cfg): + """ + Verifies that bandwidth is not split 80/20 without traffic class mapping. + """ + pass_bw_msg = "Bandwidth is NOT distributed as 80/20 without TC mapping" + fail_bw_msg = "Bandwidth matched 80/20 split without TC mapping" + is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout + + if run_bandwidth_distribution_test(cfg, set_tc_mapping=False): + if is_mlx5: + raise KsftXfailEx(fail_bw_msg) + raise KsftFailEx(fail_bw_msg) + if is_mlx5: + raise KsftFailEx("mlx5 behavior changed:" + pass_bw_msg) + ksft_pr(pass_bw_msg) + + +def test_tc_mapping_bandwidth(cfg): + """ + Verifies that bandwidth is correctly split 80/20 between TC3 and TC4 + when traffic class mapping is set. + """ + if run_bandwidth_distribution_test(cfg, set_tc_mapping=True): + ksft_pr("Bandwidth is distributed as 80/20 with TC mapping") + else: + raise KsftFailEx("Bandwidth did not match 80/20 split with TC mapping") + + +def main() -> None: + """ + Main entry point for running the test cases. + """ + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + cfg.devnl = DevlinkFamily() + + cfg.pci = os.path.basename( + os.path.realpath(f"/sys/class/net/{cfg.ifname}/device") + ) + if not cfg.pci: + raise KsftSkipEx("Could not get PCI address of the interface") + cfg.require_cmd("iperf3") + cfg.require_cmd("iperf3", remote=True) + + cfg.bw_validator = BandwidthValidator() + + cases = [test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth] + + ksft_run(cases=cases, args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index 56ff11074b55..1462a339a74b 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -13,7 +13,7 @@ try: # Import one by one to avoid pylint false positives from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ - NlError, RtnlFamily + NlError, RtnlFamily, DevlinkFamily from net.lib.py import CmdExitFailure from net.lib.py import bkg, cmd, defer, ethtool, fd_read_timeout, ip, \ rand_port, tool, wait_port_listen diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index 9ed1d8f70524..fce5d9218f1d 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -12,7 +12,7 @@ try: # Import one by one to avoid pylint false positives from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ - NlError, RtnlFamily + NlError, RtnlFamily, DevlinkFamily from net.lib.py import CmdExitFailure from net.lib.py import bkg, cmd, defer, ethtool, fd_read_timeout, ip, \ rand_port, tool, wait_port_listen diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index b5ea2526f23c..a102803ff74f 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -608,6 +608,46 @@ rate_attr_parent_check() check_err $? "Unexpected parent attr value $api_value != $parent" } +rate_attr_tc_bw_check() +{ + local handle=$1 + local tc_bw=$2 + local debug_file=$3 + + local tc_bw_str="" + for bw in $tc_bw; do + local tc=${bw%%:*} + local value=${bw##*:} + tc_bw_str="$tc_bw_str $tc:$value" + done + tc_bw_str=${tc_bw_str# } + + rate_attr_set "$handle" tc-bw "$tc_bw_str" + check_err $? "Failed to set tc-bw values" + + for bw in $tc_bw; do + local tc=${bw%%:*} + local value=${bw##*:} + local debug_value + debug_value=$(cat "$debug_file"/tc"${tc}"_bw) + check_err $? "Failed to read tc-bw value from debugfs for tc$tc" + [ "$debug_value" == "$value" ] + check_err $? "Unexpected tc-bw debug value for tc$tc: $debug_value != $value" + done + + for bw in $tc_bw; do + local tc=${bw%%:*} + local expected_value=${bw##*:} + local api_value + api_value=$(rate_attr_get "$handle" tc_"$tc") + if [ "$api_value" = "null" ]; then + api_value=0 + fi + [ "$api_value" == "$expected_value" ] + check_err $? "Unexpected tc-bw value for tc$tc: $api_value != $expected_value" + done +} + rate_node_add() { local handle=$1 @@ -649,6 +689,13 @@ rate_test() rate=$(($rate+100)) done + local tc_bw="0:0 1:40 2:0 3:0 4:0 5:0 6:60 7:0" + for r_obj in $leafs + do + rate_attr_tc_bw_check "$r_obj" "$tc_bw" \ + "$DEBUGFS_DIR"/ports/"${r_obj##*/}" + done + local node1_name='group1' local node1="$DL_HANDLE/$node1_name" rate_node_add "$node1" @@ -666,6 +713,12 @@ rate_test() rate_attr_tx_rate_check $node1 tx_max $node_tx_max \ $DEBUGFS_DIR/rate_nodes/${node1##*/}/tx_max + + local tc_bw="0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0" + rate_attr_tc_bw_check $node1 "$tc_bw" \ + "$DEBUGFS_DIR"/rate_nodes/"${node1##*/}" + + rate_node_del "$node1" check_err $? "Failed to delete node $node1" local num_nodes=`rate_nodes_get $DL_HANDLE | wc -w` diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py index 8697bd27dc30..02be28dcc089 100644 --- a/tools/testing/selftests/net/lib/py/__init__.py +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -6,4 +6,4 @@ from .netns import NetNS, NetNSEnter from .nsim import * from .utils import * from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily, RtnlAddrFamily -from .ynl import NetshaperFamily +from .ynl import NetshaperFamily, DevlinkFamily diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index 6329ae805abf..2b3a61ea3bfa 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -56,3 +56,8 @@ class NetshaperFamily(YnlFamily): def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(), schema='', recv_size=recv_size) + +class DevlinkFamily(YnlFamily): + def __init__(self, recv_size=0): + super().__init__((SPEC_PATH / Path('devlink.yaml')).as_posix(), + schema='', recv_size=recv_size) |