summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/netlink/specs/devlink.yaml32
-rw-r--r--Documentation/networking/devlink/devlink-port.rst8
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/devlink.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c1037
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h8
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch.h14
-rw-r--r--drivers/net/netdevsim/dev.c43
-rw-r--r--drivers/net/netdevsim/netdevsim.h1
-rw-r--r--drivers/net/vxlan/vxlan_vnifilter.c13
-rw-r--r--fs/nfsd/nfsctl.c36
-rw-r--r--include/net/devlink.h8
-rw-r--r--include/net/netlink.h14
-rw-r--r--include/uapi/linux/devlink.h9
-rw-r--r--net/devlink/netlink_gen.c15
-rw-r--r--net/devlink/netlink_gen.h1
-rw-r--r--net/devlink/rate.c127
-rwxr-xr-xtools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py466
-rw-r--r--tools/testing/selftests/drivers/net/hw/lib/py/__init__.py2
-rw-r--r--tools/testing/selftests/drivers/net/lib/py/__init__.py2
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/devlink.sh53
-rw-r--r--tools/testing/selftests/net/lib/py/__init__.py2
-rw-r--r--tools/testing/selftests/net/lib/py/ynl.py5
22 files changed, 1825 insertions, 73 deletions
diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml
index bfba466d694a..1c4bb0cbe5f0 100644
--- a/Documentation/netlink/specs/devlink.yaml
+++ b/Documentation/netlink/specs/devlink.yaml
@@ -224,6 +224,10 @@ definitions:
value: 10
-
name: binary
+ -
+ name: rate-tc-index-max
+ type: const
+ value: 7
attribute-sets:
-
@@ -844,7 +848,23 @@ attribute-sets:
-
name: region-direct
type: flag
-
+ -
+ name: rate-tc-bws
+ type: nest
+ multi-attr: true
+ nested-attributes: dl-rate-tc-bws
+ -
+ name: rate-tc-index
+ type: u8
+ checks:
+ max: rate-tc-index-max
+ -
+ name: rate-tc-bw
+ type: u32
+ doc: |
+ Specifies the bandwidth share assigned to the Traffic Class.
+ The bandwidth for the traffic class is determined
+ in proportion to the sum of the shares of all configured classes.
-
name: dl-dev-stats
subset-of: devlink
@@ -1249,6 +1269,14 @@ attribute-sets:
-
name: flash
type: flag
+ -
+ name: dl-rate-tc-bws
+ subset-of: devlink
+ attributes:
+ -
+ name: rate-tc-index
+ -
+ name: rate-tc-bw
operations:
enum-model: directional
@@ -2176,6 +2204,7 @@ operations:
- rate-tx-priority
- rate-tx-weight
- rate-parent-node-name
+ - rate-tc-bws
-
name: rate-new
@@ -2196,6 +2225,7 @@ operations:
- rate-tx-priority
- rate-tx-weight
- rate-parent-node-name
+ - rate-tc-bws
-
name: rate-del
diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 9d22d41a7cd1..5e397798a402 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -418,6 +418,14 @@ API allows to configure following rate object's parameters:
to all node children limits. ``tx_max`` is an upper limit for children.
``tx_share`` is a total bandwidth distributed among children.
+``tc_bw``
+ Allow users to set the bandwidth allocation per traffic class on rate
+ objects. This enables fine-grained QoS configurations by assigning a relative
+ share value to each traffic class. The bandwidth is distributed in proportion
+ to the share value for each class, relative to the sum of all shares.
+ When applied to a non-leaf node, tc_bw determines how bandwidth is shared
+ among its child elements.
+
``tx_priority`` and ``tx_weight`` can be used simultaneously. In that case
nodes with the same priority form a WFQ subgroup in the sibling group
and arbitration among them is based on assigned weights.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 42218834183a..3ffa3fbacd16 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -376,6 +376,8 @@ static const struct devlink_ops mlx5_devlink_ops = {
.eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get,
.rate_leaf_tx_share_set = mlx5_esw_devlink_rate_leaf_tx_share_set,
.rate_leaf_tx_max_set = mlx5_esw_devlink_rate_leaf_tx_max_set,
+ .rate_leaf_tc_bw_set = mlx5_esw_devlink_rate_leaf_tc_bw_set,
+ .rate_node_tc_bw_set = mlx5_esw_devlink_rate_node_tc_bw_set,
.rate_node_tx_share_set = mlx5_esw_devlink_rate_node_tx_share_set,
.rate_node_tx_max_set = mlx5_esw_devlink_rate_node_tx_max_set,
.rate_node_new = mlx5_esw_devlink_rate_node_new,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index b6ae384396b3..154bbb17ec0e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -64,11 +64,19 @@ static void esw_qos_domain_release(struct mlx5_eswitch *esw)
enum sched_node_type {
SCHED_NODE_TYPE_VPORTS_TSAR,
SCHED_NODE_TYPE_VPORT,
+ SCHED_NODE_TYPE_TC_ARBITER_TSAR,
+ SCHED_NODE_TYPE_RATE_LIMITER,
+ SCHED_NODE_TYPE_VPORT_TC,
+ SCHED_NODE_TYPE_VPORTS_TC_TSAR,
};
static const char * const sched_node_type_str[] = {
[SCHED_NODE_TYPE_VPORTS_TSAR] = "vports TSAR",
[SCHED_NODE_TYPE_VPORT] = "vport",
+ [SCHED_NODE_TYPE_TC_ARBITER_TSAR] = "TC Arbiter TSAR",
+ [SCHED_NODE_TYPE_RATE_LIMITER] = "Rate Limiter",
+ [SCHED_NODE_TYPE_VPORT_TC] = "vport TC",
+ [SCHED_NODE_TYPE_VPORTS_TC_TSAR] = "vports TC TSAR",
};
struct mlx5_esw_sched_node {
@@ -92,6 +100,8 @@ struct mlx5_esw_sched_node {
struct mlx5_vport *vport;
/* Level in the hierarchy. The root node level is 1. */
u8 level;
+ /* Valid only when this node represents a traffic class. */
+ u8 tc;
};
static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node)
@@ -106,6 +116,13 @@ static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node)
}
}
+static int esw_qos_num_tcs(struct mlx5_core_dev *dev)
+{
+ int num_tcs = mlx5_max_tc(dev) + 1;
+
+ return num_tcs < DEVLINK_RATE_TCS_MAX ? num_tcs : DEVLINK_RATE_TCS_MAX;
+}
+
static void
esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent)
{
@@ -116,8 +133,38 @@ esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_
esw_qos_node_attach_to_parent(node);
}
+static void esw_qos_nodes_set_parent(struct list_head *nodes,
+ struct mlx5_esw_sched_node *parent)
+{
+ struct mlx5_esw_sched_node *node, *tmp;
+
+ list_for_each_entry_safe(node, tmp, nodes, entry) {
+ esw_qos_node_set_parent(node, parent);
+ if (!list_empty(&node->children) &&
+ parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+ struct mlx5_esw_sched_node *child;
+
+ list_for_each_entry(child, &node->children, entry) {
+ struct mlx5_vport *vport = child->vport;
+
+ if (vport)
+ vport->qos.sched_node->parent = parent;
+ }
+ }
+ }
+}
+
void mlx5_esw_qos_vport_qos_free(struct mlx5_vport *vport)
{
+ if (vport->qos.sched_nodes) {
+ int num_tcs = esw_qos_num_tcs(vport->qos.sched_node->esw->dev);
+ int i;
+
+ for (i = 0; i < num_tcs; i++)
+ kfree(vport->qos.sched_nodes[i]);
+ kfree(vport->qos.sched_nodes);
+ }
+
kfree(vport->qos.sched_node);
memset(&vport->qos, 0, sizeof(vport->qos));
}
@@ -141,16 +188,37 @@ mlx5_esw_qos_vport_get_parent(const struct mlx5_vport *vport)
static void esw_qos_sched_elem_warn(struct mlx5_esw_sched_node *node, int err, const char *op)
{
- if (node->vport) {
+ switch (node->type) {
+ case SCHED_NODE_TYPE_VPORTS_TC_TSAR:
+ esw_warn(node->esw->dev,
+ "E-Switch %s %s scheduling element failed (tc=%d,err=%d)\n",
+ op, sched_node_type_str[node->type], node->tc, err);
+ break;
+ case SCHED_NODE_TYPE_VPORT_TC:
+ esw_warn(node->esw->dev,
+ "E-Switch %s %s scheduling element failed (vport=%d,tc=%d,err=%d)\n",
+ op,
+ sched_node_type_str[node->type],
+ node->vport->vport, node->tc, err);
+ break;
+ case SCHED_NODE_TYPE_VPORT:
esw_warn(node->esw->dev,
"E-Switch %s %s scheduling element failed (vport=%d,err=%d)\n",
op, sched_node_type_str[node->type], node->vport->vport, err);
- return;
+ break;
+ case SCHED_NODE_TYPE_RATE_LIMITER:
+ case SCHED_NODE_TYPE_TC_ARBITER_TSAR:
+ case SCHED_NODE_TYPE_VPORTS_TSAR:
+ esw_warn(node->esw->dev,
+ "E-Switch %s %s scheduling element failed (err=%d)\n",
+ op, sched_node_type_str[node->type], err);
+ break;
+ default:
+ esw_warn(node->esw->dev,
+ "E-Switch %s scheduling element failed (err=%d)\n",
+ op, err);
+ break;
}
-
- esw_warn(node->esw->dev,
- "E-Switch %s %s scheduling element failed (err=%d)\n",
- op, sched_node_type_str[node->type], err);
}
static int esw_qos_node_create_sched_element(struct mlx5_esw_sched_node *node, void *ctx,
@@ -233,6 +301,24 @@ static int esw_qos_sched_elem_config(struct mlx5_esw_sched_node *node, u32 max_r
return 0;
}
+static int esw_qos_create_rate_limit_element(struct mlx5_esw_sched_node *node,
+ struct netlink_ext_ack *extack)
+{
+ u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+
+ if (!mlx5_qos_element_type_supported(
+ node->esw->dev,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT,
+ SCHEDULING_HIERARCHY_E_SWITCH))
+ return -EOPNOTSUPP;
+
+ MLX5_SET(scheduling_context, sched_ctx, max_average_bw, node->max_rate);
+ MLX5_SET(scheduling_context, sched_ctx, element_type,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT);
+
+ return esw_qos_node_create_sched_element(node, sched_ctx, extack);
+}
+
static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
struct mlx5_esw_sched_node *parent)
{
@@ -266,11 +352,13 @@ static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
return 0;
}
-static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max)
+static u32 esw_qos_calc_bw_share(u32 value, u32 divider, u32 fw_max)
{
if (!divider)
return 0;
- return min_t(u32, max_t(u32, DIV_ROUND_UP(min_rate, divider), MLX5_MIN_BW_SHARE), fw_max);
+ return min_t(u32, fw_max,
+ max_t(u32,
+ DIV_ROUND_UP(value, divider), MLX5_MIN_BW_SHARE));
}
static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node,
@@ -297,7 +385,13 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
if (node->esw != esw || node->ix == esw->qos.root_tsar_ix)
continue;
- esw_qos_update_sched_node_bw_share(node, divider, extack);
+ /* Vports TC TSARs don't have a minimum rate configured,
+ * so there's no need to update the bw_share on them.
+ */
+ if (node->type != SCHED_NODE_TYPE_VPORTS_TC_TSAR) {
+ esw_qos_update_sched_node_bw_share(node, divider,
+ extack);
+ }
if (list_empty(&node->children))
continue;
@@ -306,6 +400,20 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
}
}
+static u32 esw_qos_calculate_tc_bw_divider(u32 *tc_bw)
+{
+ u32 total = 0;
+ int i;
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++)
+ total += tc_bw[i];
+
+ /* If total is zero, tc-bw config is disabled and we shouldn't reach
+ * here.
+ */
+ return WARN_ON(!total) ? 1 : total;
+}
+
static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
u32 min_rate, struct netlink_ext_ack *extack)
{
@@ -350,28 +458,64 @@ esw_qos_create_node_sched_elem(struct mlx5_core_dev *dev, u32 parent_element_id,
tsar_ix);
}
-static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node,
- struct netlink_ext_ack *extack)
+static int
+esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node,
+ struct netlink_ext_ack *extack)
{
u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
struct mlx5_core_dev *dev = vport_node->esw->dev;
void *attr;
- if (!mlx5_qos_element_type_supported(dev,
- SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT,
- SCHEDULING_HIERARCHY_E_SWITCH))
+ if (!mlx5_qos_element_type_supported(
+ dev,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT,
+ SCHEDULING_HIERARCHY_E_SWITCH))
return -EOPNOTSUPP;
MLX5_SET(scheduling_context, sched_ctx, element_type,
SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport);
- MLX5_SET(scheduling_context, sched_ctx, parent_element_id, vport_node->parent->ix);
- MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate);
+ MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
+ vport_node->parent->ix);
+ MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
+ vport_node->max_rate);
return esw_qos_node_create_sched_element(vport_node, sched_ctx, extack);
}
+static int
+esw_qos_vport_tc_create_sched_element(struct mlx5_esw_sched_node *vport_tc_node,
+ u32 rate_limit_elem_ix,
+ struct netlink_ext_ack *extack)
+{
+ u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+ struct mlx5_core_dev *dev = vport_tc_node->esw->dev;
+ void *attr;
+
+ if (!mlx5_qos_element_type_supported(
+ dev,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC,
+ SCHEDULING_HIERARCHY_E_SWITCH))
+ return -EOPNOTSUPP;
+
+ MLX5_SET(scheduling_context, sched_ctx, element_type,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC);
+ attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
+ MLX5_SET(vport_tc_element, attr, vport_number,
+ vport_tc_node->vport->vport);
+ MLX5_SET(vport_tc_element, attr, traffic_class, vport_tc_node->tc);
+ MLX5_SET(scheduling_context, sched_ctx, max_bw_obj_id,
+ rate_limit_elem_ix);
+ MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
+ vport_tc_node->parent->ix);
+ MLX5_SET(scheduling_context, sched_ctx, bw_share,
+ vport_tc_node->bw_share);
+
+ return esw_qos_node_create_sched_element(vport_tc_node, sched_ctx,
+ extack);
+}
+
static struct mlx5_esw_sched_node *
__esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type type,
struct mlx5_esw_sched_node *parent)
@@ -388,6 +532,14 @@ __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type
node->parent = parent;
INIT_LIST_HEAD(&node->children);
esw_qos_node_attach_to_parent(node);
+ if (!parent) {
+ /* The caller is responsible for inserting the node into the
+ * parent list if necessary. This function can also be used with
+ * a NULL parent, which doesn't necessarily indicate that it
+ * refers to the root scheduling element.
+ */
+ list_del_init(&node->entry);
+ }
return node;
}
@@ -404,6 +556,149 @@ static void esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlin
__esw_qos_free_node(node);
}
+static int esw_qos_create_vports_tc_node(struct mlx5_esw_sched_node *parent,
+ u8 tc, struct netlink_ext_ack *extack)
+{
+ u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+ struct mlx5_core_dev *dev = parent->esw->dev;
+ struct mlx5_esw_sched_node *vports_tc_node;
+ void *attr;
+ int err;
+
+ if (!mlx5_qos_element_type_supported(
+ dev,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR,
+ SCHEDULING_HIERARCHY_E_SWITCH) ||
+ !mlx5_qos_tsar_type_supported(dev,
+ TSAR_ELEMENT_TSAR_TYPE_DWRR,
+ SCHEDULING_HIERARCHY_E_SWITCH))
+ return -EOPNOTSUPP;
+
+ vports_tc_node = __esw_qos_alloc_node(parent->esw, 0,
+ SCHED_NODE_TYPE_VPORTS_TC_TSAR,
+ parent);
+ if (!vports_tc_node) {
+ NL_SET_ERR_MSG_MOD(extack, "E-Switch alloc node failed");
+ esw_warn(dev, "Failed to alloc vports TC node (tc=%d)\n", tc);
+ return -ENOMEM;
+ }
+
+ attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
+ MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR);
+ MLX5_SET(tsar_element, attr, traffic_class, tc);
+ MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, parent->ix);
+ MLX5_SET(scheduling_context, tsar_ctx, element_type,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
+
+ err = esw_qos_node_create_sched_element(vports_tc_node, tsar_ctx,
+ extack);
+ if (err)
+ goto err_create_sched_element;
+
+ vports_tc_node->tc = tc;
+
+ return 0;
+
+err_create_sched_element:
+ __esw_qos_free_node(vports_tc_node);
+ return err;
+}
+
+static void
+esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
+ u32 *tc_bw)
+{
+ struct mlx5_esw_sched_node *vports_tc_node;
+
+ list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry)
+ tc_bw[vports_tc_node->tc] = vports_tc_node->bw_share;
+}
+
+static void
+esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
+ u32 *tc_bw, struct netlink_ext_ack *extack)
+{
+ struct mlx5_eswitch *esw = tc_arbiter_node->esw;
+ struct mlx5_esw_sched_node *vports_tc_node;
+ u32 divider, fw_max_bw_share;
+
+ fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+ divider = esw_qos_calculate_tc_bw_divider(tc_bw);
+ list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) {
+ u8 tc = vports_tc_node->tc;
+ u32 bw_share;
+
+ bw_share = tc_bw[tc] * fw_max_bw_share;
+ bw_share = esw_qos_calc_bw_share(bw_share, divider,
+ fw_max_bw_share);
+ esw_qos_sched_elem_config(vports_tc_node, 0, bw_share, extack);
+ }
+}
+
+static void
+esw_qos_destroy_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vports_tc_node, *tmp;
+
+ list_for_each_entry_safe(vports_tc_node, tmp,
+ &tc_arbiter_node->children, entry)
+ esw_qos_destroy_node(vports_tc_node, extack);
+}
+
+static int
+esw_qos_create_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_eswitch *esw = tc_arbiter_node->esw;
+ int err, i, num_tcs = esw_qos_num_tcs(esw->dev);
+
+ for (i = 0; i < num_tcs; i++) {
+ err = esw_qos_create_vports_tc_node(tc_arbiter_node, i, extack);
+ if (err)
+ goto err_tc_node_create;
+ }
+
+ return 0;
+
+err_tc_node_create:
+ esw_qos_destroy_vports_tc_nodes(tc_arbiter_node, NULL);
+ return err;
+}
+
+static int esw_qos_create_tc_arbiter_sched_elem(
+ struct mlx5_esw_sched_node *tc_arbiter_node,
+ struct netlink_ext_ack *extack)
+{
+ u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+ u32 tsar_parent_ix;
+ void *attr;
+
+ if (!mlx5_qos_tsar_type_supported(tc_arbiter_node->esw->dev,
+ TSAR_ELEMENT_TSAR_TYPE_TC_ARB,
+ SCHEDULING_HIERARCHY_E_SWITCH)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "E-Switch TC Arbiter scheduling element is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
+ MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_TC_ARB);
+ tsar_parent_ix = tc_arbiter_node->parent ? tc_arbiter_node->parent->ix :
+ tc_arbiter_node->esw->qos.root_tsar_ix;
+ MLX5_SET(scheduling_context, tsar_ctx, parent_element_id,
+ tsar_parent_ix);
+ MLX5_SET(scheduling_context, tsar_ctx, element_type,
+ SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
+ MLX5_SET(scheduling_context, tsar_ctx, max_average_bw,
+ tc_arbiter_node->max_rate);
+ MLX5_SET(scheduling_context, tsar_ctx, bw_share,
+ tc_arbiter_node->bw_share);
+
+ return esw_qos_node_create_sched_element(tc_arbiter_node, tsar_ctx,
+ extack);
+}
+
static struct mlx5_esw_sched_node *
__esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent,
struct netlink_ext_ack *extack)
@@ -426,6 +721,7 @@ __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sch
goto err_alloc_node;
}
+ list_add_tail(&node->entry, &esw->qos.domain->nodes);
esw_qos_normalize_min_rate(esw, NULL, extack);
trace_mlx5_esw_node_qos_create(esw->dev, node, node->ix);
@@ -467,6 +763,9 @@ static void __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netl
{
struct mlx5_eswitch *esw = node->esw;
+ if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ esw_qos_destroy_vports_tc_nodes(node, extack);
+
trace_mlx5_esw_node_qos_destroy(esw->dev, node, node->ix);
esw_qos_destroy_node(node, extack);
esw_qos_normalize_min_rate(esw, NULL, extack);
@@ -498,6 +797,9 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta
SCHED_NODE_TYPE_VPORTS_TSAR,
NULL))
esw->qos.node0 = ERR_PTR(-ENOMEM);
+ else
+ list_add_tail(&esw->qos.node0->entry,
+ &esw->qos.domain->nodes);
}
if (IS_ERR(esw->qos.node0)) {
err = PTR_ERR(esw->qos.node0);
@@ -555,12 +857,239 @@ static void esw_qos_put(struct mlx5_eswitch *esw)
esw_qos_destroy(esw);
}
+static void
+esw_qos_tc_arbiter_scheduling_teardown(struct mlx5_esw_sched_node *node,
+ struct netlink_ext_ack *extack)
+{
+ /* Clean up all Vports TC nodes within the TC arbiter node. */
+ esw_qos_destroy_vports_tc_nodes(node, extack);
+ /* Destroy the scheduling element for the TC arbiter node itself. */
+ esw_qos_node_destroy_sched_element(node, extack);
+}
+
+static int esw_qos_tc_arbiter_scheduling_setup(struct mlx5_esw_sched_node *node,
+ struct netlink_ext_ack *extack)
+{
+ u32 curr_ix = node->ix;
+ int err;
+
+ err = esw_qos_create_tc_arbiter_sched_elem(node, extack);
+ if (err)
+ return err;
+ /* Initialize the vports TC nodes within created TC arbiter TSAR. */
+ err = esw_qos_create_vports_tc_nodes(node, extack);
+ if (err)
+ goto err_vports_tc_nodes;
+
+ node->type = SCHED_NODE_TYPE_TC_ARBITER_TSAR;
+
+ return 0;
+
+err_vports_tc_nodes:
+ /* If initialization fails, clean up the scheduling element
+ * for the TC arbiter node.
+ */
+ esw_qos_node_destroy_sched_element(node, NULL);
+ node->ix = curr_ix;
+ return err;
+}
+
+static int
+esw_qos_create_vport_tc_sched_node(struct mlx5_vport *vport,
+ u32 rate_limit_elem_ix,
+ struct mlx5_esw_sched_node *vports_tc_node,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+ struct mlx5_esw_sched_node *vport_tc_node;
+ u8 tc = vports_tc_node->tc;
+ int err;
+
+ vport_tc_node = __esw_qos_alloc_node(vport_node->esw, 0,
+ SCHED_NODE_TYPE_VPORT_TC,
+ vports_tc_node);
+ if (!vport_tc_node)
+ return -ENOMEM;
+
+ vport_tc_node->min_rate = vport_node->min_rate;
+ vport_tc_node->tc = tc;
+ vport_tc_node->vport = vport;
+ err = esw_qos_vport_tc_create_sched_element(vport_tc_node,
+ rate_limit_elem_ix,
+ extack);
+ if (err)
+ goto err_out;
+
+ vport->qos.sched_nodes[tc] = vport_tc_node;
+
+ return 0;
+err_out:
+ __esw_qos_free_node(vport_tc_node);
+ return err;
+}
+
+static void
+esw_qos_destroy_vport_tc_sched_elements(struct mlx5_vport *vport,
+ struct netlink_ext_ack *extack)
+{
+ int i, num_tcs = esw_qos_num_tcs(vport->qos.sched_node->esw->dev);
+
+ for (i = 0; i < num_tcs; i++) {
+ if (vport->qos.sched_nodes[i]) {
+ __esw_qos_destroy_node(vport->qos.sched_nodes[i],
+ extack);
+ }
+ }
+
+ kfree(vport->qos.sched_nodes);
+ vport->qos.sched_nodes = NULL;
+}
+
+static int
+esw_qos_create_vport_tc_sched_elements(struct mlx5_vport *vport,
+ enum sched_node_type type,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+ struct mlx5_esw_sched_node *tc_arbiter_node, *vports_tc_node;
+ int err, num_tcs = esw_qos_num_tcs(vport_node->esw->dev);
+ u32 rate_limit_elem_ix;
+
+ vport->qos.sched_nodes = kcalloc(num_tcs,
+ sizeof(struct mlx5_esw_sched_node *),
+ GFP_KERNEL);
+ if (!vport->qos.sched_nodes) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Allocating the vport TC scheduling elements failed.");
+ return -ENOMEM;
+ }
+
+ rate_limit_elem_ix = type == SCHED_NODE_TYPE_RATE_LIMITER ?
+ vport_node->ix : 0;
+ tc_arbiter_node = type == SCHED_NODE_TYPE_RATE_LIMITER ?
+ vport_node->parent : vport_node;
+ list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) {
+ err = esw_qos_create_vport_tc_sched_node(vport,
+ rate_limit_elem_ix,
+ vports_tc_node,
+ extack);
+ if (err)
+ goto err_create_vport_tc;
+ }
+
+ return 0;
+
+err_create_vport_tc:
+ esw_qos_destroy_vport_tc_sched_elements(vport, NULL);
+
+ return err;
+}
+
+static int
+esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+ int err, new_level, max_level;
+
+ if (type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+ /* Increase the parent's level by 2 to account for both the
+ * TC arbiter and the vports TC scheduling element.
+ */
+ new_level = vport_node->parent->level + 2;
+ max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev,
+ log_esw_max_sched_depth);
+ if (new_level > max_level) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "TC arbitration on leafs is not supported beyond max scheduling depth");
+ return -EOPNOTSUPP;
+ }
+ }
+
+ esw_assert_qos_lock_held(vport->dev->priv.eswitch);
+
+ if (type == SCHED_NODE_TYPE_RATE_LIMITER)
+ err = esw_qos_create_rate_limit_element(vport_node, extack);
+ else
+ err = esw_qos_tc_arbiter_scheduling_setup(vport_node, extack);
+ if (err)
+ return err;
+
+ /* Rate limiters impact multiple nodes not directly connected to them
+ * and are not direct members of the QoS hierarchy.
+ * Unlink it from the parent to reflect that.
+ */
+ if (type == SCHED_NODE_TYPE_RATE_LIMITER) {
+ list_del_init(&vport_node->entry);
+ vport_node->level = 0;
+ }
+
+ err = esw_qos_create_vport_tc_sched_elements(vport, type, extack);
+ if (err)
+ goto err_sched_nodes;
+
+ return 0;
+
+err_sched_nodes:
+ if (type == SCHED_NODE_TYPE_RATE_LIMITER) {
+ esw_qos_node_destroy_sched_element(vport_node, NULL);
+ list_add_tail(&vport_node->entry,
+ &vport_node->parent->children);
+ vport_node->level = vport_node->parent->level + 1;
+ } else {
+ esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL);
+ }
+ return err;
+}
+
+static void esw_qos_vport_tc_disable(struct mlx5_vport *vport,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+ enum sched_node_type curr_type = vport_node->type;
+
+ esw_qos_destroy_vport_tc_sched_elements(vport, extack);
+
+ if (curr_type == SCHED_NODE_TYPE_RATE_LIMITER)
+ esw_qos_node_destroy_sched_element(vport_node, extack);
+ else
+ esw_qos_tc_arbiter_scheduling_teardown(vport_node, extack);
+}
+
+static int esw_qos_set_vport_tcs_min_rate(struct mlx5_vport *vport,
+ u32 min_rate,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+ int err, i, num_tcs = esw_qos_num_tcs(vport_node->esw->dev);
+
+ for (i = 0; i < num_tcs; i++) {
+ err = esw_qos_set_node_min_rate(vport->qos.sched_nodes[i],
+ min_rate, extack);
+ if (err)
+ goto err_out;
+ }
+ vport_node->min_rate = min_rate;
+
+ return 0;
+err_out:
+ for (--i; i >= 0; i--) {
+ esw_qos_set_node_min_rate(vport->qos.sched_nodes[i],
+ vport_node->min_rate, extack);
+ }
+ return err;
+}
+
static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack)
{
struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
struct mlx5_esw_sched_node *parent = vport_node->parent;
+ enum sched_node_type curr_type = vport_node->type;
- esw_qos_node_destroy_sched_element(vport_node, extack);
+ if (curr_type == SCHED_NODE_TYPE_VPORT)
+ esw_qos_node_destroy_sched_element(vport_node, extack);
+ else
+ esw_qos_vport_tc_disable(vport, extack);
vport_node->bw_share = 0;
list_del_init(&vport_node->entry);
@@ -569,7 +1098,9 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a
trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport);
}
-static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent,
+static int esw_qos_vport_enable(struct mlx5_vport *vport,
+ enum sched_node_type type,
+ struct mlx5_esw_sched_node *parent,
struct netlink_ext_ack *extack)
{
int err;
@@ -577,10 +1108,16 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_
esw_assert_qos_lock_held(vport->dev->priv.eswitch);
esw_qos_node_set_parent(vport->qos.sched_node, parent);
- err = esw_qos_vport_create_sched_element(vport->qos.sched_node, extack);
+ if (type == SCHED_NODE_TYPE_VPORT) {
+ err = esw_qos_vport_create_sched_element(vport->qos.sched_node,
+ extack);
+ } else {
+ err = esw_qos_vport_tc_enable(vport, type, extack);
+ }
if (err)
return err;
+ vport->qos.sched_node->type = type;
esw_qos_normalize_min_rate(parent->esw, parent, extack);
trace_mlx5_esw_vport_qos_create(vport->dev, vport,
vport->qos.sched_node->max_rate,
@@ -611,9 +1148,8 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t
sched_node->min_rate = min_rate;
sched_node->vport = vport;
vport->qos.sched_node = sched_node;
- err = esw_qos_vport_enable(vport, parent, extack);
+ err = esw_qos_vport_enable(vport, type, parent, extack);
if (err) {
- __esw_qos_free_node(sched_node);
esw_qos_put(esw);
vport->qos.sched_node = NULL;
}
@@ -666,6 +1202,8 @@ static int mlx5_esw_qos_set_vport_min_rate(struct mlx5_vport *vport, u32 min_rat
if (!vport_node)
return mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, NULL, 0, min_rate,
extack);
+ else if (vport_node->type == SCHED_NODE_TYPE_RATE_LIMITER)
+ return esw_qos_set_vport_tcs_min_rate(vport, min_rate, extack);
else
return esw_qos_set_node_min_rate(vport_node, min_rate, extack);
}
@@ -698,12 +1236,73 @@ bool mlx5_esw_qos_get_vport_rate(struct mlx5_vport *vport, u32 *max_rate, u32 *m
return enabled;
}
+static int esw_qos_vport_tc_check_type(enum sched_node_type curr_type,
+ enum sched_node_type new_type,
+ struct netlink_ext_ack *extack)
+{
+ if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR &&
+ new_type == SCHED_NODE_TYPE_RATE_LIMITER) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot switch from vport-level TC arbitration to node-level TC arbitration");
+ return -EOPNOTSUPP;
+ }
+
+ if (curr_type == SCHED_NODE_TYPE_RATE_LIMITER &&
+ new_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot switch from node-level TC arbitration to vport-level TC arbitration");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int esw_qos_vport_update(struct mlx5_vport *vport,
+ enum sched_node_type type,
+ struct mlx5_esw_sched_node *parent,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent;
+ enum sched_node_type curr_type = vport->qos.sched_node->type;
+ u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0};
+ int err;
+
+ esw_assert_qos_lock_held(vport->dev->priv.eswitch);
+ parent = parent ?: curr_parent;
+ if (curr_type == type && curr_parent == parent)
+ return 0;
+
+ err = esw_qos_vport_tc_check_type(curr_type, type, extack);
+ if (err)
+ return err;
+
+ if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
+ esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node,
+ curr_tc_bw);
+ }
+
+ esw_qos_vport_disable(vport, extack);
+
+ err = esw_qos_vport_enable(vport, type, parent, extack);
+ if (err) {
+ esw_qos_vport_enable(vport, curr_type, curr_parent, NULL);
+ extack = NULL;
+ }
+
+ if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
+ esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node,
+ curr_tc_bw, extack);
+ }
+
+ return err;
+}
+
static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent,
struct netlink_ext_ack *extack)
{
struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
struct mlx5_esw_sched_node *curr_parent;
- int err;
+ enum sched_node_type type;
esw_assert_qos_lock_held(esw);
curr_parent = vport->qos.sched_node->parent;
@@ -711,15 +1310,205 @@ static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw
if (curr_parent == parent)
return 0;
- esw_qos_vport_disable(vport, extack);
+ /* Set vport QoS type based on parent node type if different from
+ * default QoS; otherwise, use the vport's current QoS type.
+ */
+ if (parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ type = SCHED_NODE_TYPE_RATE_LIMITER;
+ else if (curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ type = SCHED_NODE_TYPE_VPORT;
+ else
+ type = vport->qos.sched_node->type;
+
+ return esw_qos_vport_update(vport, type, parent, extack);
+}
- err = esw_qos_vport_enable(vport, parent, extack);
+static void
+esw_qos_switch_vport_tcs_to_vport(struct mlx5_esw_sched_node *tc_arbiter_node,
+ struct mlx5_esw_sched_node *node,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vports_tc_node, *vport_tc_node, *tmp;
+
+ vports_tc_node = list_first_entry(&tc_arbiter_node->children,
+ struct mlx5_esw_sched_node,
+ entry);
+
+ list_for_each_entry_safe(vport_tc_node, tmp, &vports_tc_node->children,
+ entry)
+ esw_qos_vport_update_parent(vport_tc_node->vport, node, extack);
+}
+
+static int esw_qos_switch_tc_arbiter_node_to_vports(
+ struct mlx5_esw_sched_node *tc_arbiter_node,
+ struct mlx5_esw_sched_node *node,
+ struct netlink_ext_ack *extack)
+{
+ u32 parent_tsar_ix = node->parent ?
+ node->parent->ix : node->esw->qos.root_tsar_ix;
+ int err;
+
+ err = esw_qos_create_node_sched_elem(node->esw->dev, parent_tsar_ix,
+ node->max_rate, node->bw_share,
+ &node->ix);
if (err) {
- if (esw_qos_vport_enable(vport, curr_parent, NULL))
- esw_warn(parent->esw->dev, "vport restore QoS failed (vport=%d)\n",
- vport->vport);
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to create scheduling element for vports node when disabliing vports TC QoS");
+ return err;
+ }
+
+ node->type = SCHED_NODE_TYPE_VPORTS_TSAR;
+
+ /* Disable TC QoS for vports in the arbiter node. */
+ esw_qos_switch_vport_tcs_to_vport(tc_arbiter_node, node, extack);
+
+ return 0;
+}
+
+static int esw_qos_switch_vports_node_to_tc_arbiter(
+ struct mlx5_esw_sched_node *node,
+ struct mlx5_esw_sched_node *tc_arbiter_node,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vport_node, *tmp;
+ struct mlx5_vport *vport;
+ int err;
+
+ /* Enable TC QoS for each vport in the node. */
+ list_for_each_entry_safe(vport_node, tmp, &node->children, entry) {
+ vport = vport_node->vport;
+ err = esw_qos_vport_update_parent(vport, tc_arbiter_node,
+ extack);
+ if (err)
+ goto err_out;
+ }
+
+ /* Destroy the current vports node TSAR. */
+ err = mlx5_destroy_scheduling_element_cmd(node->esw->dev,
+ SCHEDULING_HIERARCHY_E_SWITCH,
+ node->ix);
+ if (err)
+ goto err_out;
+
+ return 0;
+err_out:
+ /* Restore vports back into the node if an error occurs. */
+ esw_qos_switch_vport_tcs_to_vport(tc_arbiter_node, node, NULL);
+
+ return err;
+}
+
+static struct mlx5_esw_sched_node *
+esw_qos_move_node(struct mlx5_esw_sched_node *curr_node)
+{
+ struct mlx5_esw_sched_node *new_node;
+
+ new_node = __esw_qos_alloc_node(curr_node->esw, curr_node->ix,
+ curr_node->type, NULL);
+ if (!IS_ERR(new_node))
+ esw_qos_nodes_set_parent(&curr_node->children, new_node);
+
+ return new_node;
+}
+
+static int esw_qos_node_disable_tc_arbitration(struct mlx5_esw_sched_node *node,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *curr_node;
+ int err;
+
+ if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ return 0;
+
+ /* Allocate a new rate node to hold the current state, which will allow
+ * for restoring the vports back to this node after disabling TC
+ * arbitration.
+ */
+ curr_node = esw_qos_move_node(node);
+ if (IS_ERR(curr_node)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed setting up vports node");
+ return PTR_ERR(curr_node);
+ }
+
+ /* Disable TC QoS for all vports, and assign them back to the node. */
+ err = esw_qos_switch_tc_arbiter_node_to_vports(curr_node, node, extack);
+ if (err)
+ goto err_out;
+
+ /* Clean up the TC arbiter node after disabling TC QoS for vports. */
+ esw_qos_tc_arbiter_scheduling_teardown(curr_node, extack);
+ goto out;
+err_out:
+ esw_qos_nodes_set_parent(&curr_node->children, node);
+out:
+ __esw_qos_free_node(curr_node);
+ return err;
+}
+
+static int esw_qos_node_enable_tc_arbitration(struct mlx5_esw_sched_node *node,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *curr_node, *child;
+ int err, new_level, max_level;
+
+ if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ return 0;
+
+ /* Increase the hierarchy level by one to account for the additional
+ * vports TC scheduling node, and verify that the new level does not
+ * exceed the maximum allowed depth.
+ */
+ new_level = node->level + 1;
+ max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth);
+ if (new_level > max_level) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "TC arbitration on nodes is not supported beyond max scheduling depth");
+ return -EOPNOTSUPP;
+ }
+
+ /* Ensure the node does not contain non-leaf children before assigning
+ * TC bandwidth.
+ */
+ if (!list_empty(&node->children)) {
+ list_for_each_entry(child, &node->children, entry) {
+ if (!child->vport) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot configure TC bandwidth on a node with non-leaf children");
+ return -EOPNOTSUPP;
+ }
+ }
}
+ /* Allocate a new node that will store the information of the current
+ * node. This will be used later to restore the node if necessary.
+ */
+ curr_node = esw_qos_move_node(node);
+ if (IS_ERR(curr_node)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed setting up node TC QoS");
+ return PTR_ERR(curr_node);
+ }
+
+ /* Initialize the TC arbiter node for QoS management.
+ * This step prepares the node for handling Traffic Class arbitration.
+ */
+ err = esw_qos_tc_arbiter_scheduling_setup(node, extack);
+ if (err)
+ goto err_setup;
+
+ /* Enable TC QoS for each vport within the current node. */
+ err = esw_qos_switch_vports_node_to_tc_arbiter(curr_node, node, extack);
+ if (err)
+ goto err_switch_vports;
+ goto out;
+
+err_switch_vports:
+ esw_qos_tc_arbiter_scheduling_teardown(node, NULL);
+ node->ix = curr_node->ix;
+ node->type = curr_node->type;
+err_setup:
+ esw_qos_nodes_set_parent(&curr_node->children, node);
+out:
+ __esw_qos_free_node(curr_node);
return err;
}
@@ -848,6 +1637,41 @@ static int esw_qos_devlink_rate_to_mbps(struct mlx5_core_dev *mdev, const char *
return 0;
}
+static bool esw_qos_validate_unsupported_tc_bw(struct mlx5_eswitch *esw,
+ u32 *tc_bw)
+{
+ int i, num_tcs = esw_qos_num_tcs(esw->dev);
+
+ for (i = num_tcs; i < DEVLINK_RATE_TCS_MAX; i++) {
+ if (tc_bw[i])
+ return false;
+ }
+
+ return true;
+}
+
+static bool esw_qos_vport_validate_unsupported_tc_bw(struct mlx5_vport *vport,
+ u32 *tc_bw)
+{
+ struct mlx5_eswitch *esw = vport->qos.sched_node ?
+ vport->qos.sched_node->parent->esw :
+ vport->dev->priv.eswitch;
+
+ return esw_qos_validate_unsupported_tc_bw(esw, tc_bw);
+}
+
+static bool esw_qos_tc_bw_disabled(u32 *tc_bw)
+{
+ int i;
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+ if (tc_bw[i])
+ return false;
+ }
+
+ return true;
+}
+
int mlx5_esw_qos_init(struct mlx5_eswitch *esw)
{
if (esw->qos.domain)
@@ -906,6 +1730,90 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *
return err;
}
+int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf,
+ void *priv,
+ u32 *tc_bw,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *vport_node;
+ struct mlx5_vport *vport = priv;
+ struct mlx5_eswitch *esw;
+ bool disable;
+ int err = 0;
+
+ esw = vport->dev->priv.eswitch;
+ if (!mlx5_esw_allowed(esw))
+ return -EPERM;
+
+ disable = esw_qos_tc_bw_disabled(tc_bw);
+ esw_qos_lock(esw);
+
+ if (!esw_qos_vport_validate_unsupported_tc_bw(vport, tc_bw)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "E-Switch traffic classes number is not supported");
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ vport_node = vport->qos.sched_node;
+ if (disable && !vport_node)
+ goto unlock;
+
+ if (disable) {
+ if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT,
+ NULL, extack);
+ goto unlock;
+ }
+
+ if (!vport_node) {
+ err = mlx5_esw_qos_vport_enable(vport,
+ SCHED_NODE_TYPE_TC_ARBITER_TSAR,
+ NULL, 0, 0, extack);
+ vport_node = vport->qos.sched_node;
+ } else {
+ err = esw_qos_vport_update(vport,
+ SCHED_NODE_TYPE_TC_ARBITER_TSAR,
+ NULL, extack);
+ }
+ if (!err)
+ esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack);
+unlock:
+ esw_qos_unlock(esw);
+ return err;
+}
+
+int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node,
+ void *priv,
+ u32 *tc_bw,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *node = priv;
+ struct mlx5_eswitch *esw = node->esw;
+ bool disable;
+ int err;
+
+ if (!esw_qos_validate_unsupported_tc_bw(esw, tc_bw)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "E-Switch traffic classes number is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ disable = esw_qos_tc_bw_disabled(tc_bw);
+ esw_qos_lock(esw);
+ if (disable) {
+ err = esw_qos_node_disable_tc_arbitration(node, extack);
+ goto unlock;
+ }
+
+ err = esw_qos_node_enable_tc_arbitration(node, extack);
+ if (!err)
+ esw_qos_set_tc_arbiter_bw_shares(node, tc_bw, extack);
+unlock:
+ esw_qos_unlock(esw);
+ return err;
+}
+
int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv,
u64 tx_share, struct netlink_ext_ack *extack)
{
@@ -996,10 +1904,16 @@ int mlx5_esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw_s
}
esw_qos_lock(esw);
- if (!vport->qos.sched_node && parent)
- err = mlx5_esw_qos_vport_enable(vport, SCHED_NODE_TYPE_VPORT, parent, 0, 0, extack);
- else if (vport->qos.sched_node)
+ if (!vport->qos.sched_node && parent) {
+ enum sched_node_type type;
+
+ type = parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR ?
+ SCHED_NODE_TYPE_RATE_LIMITER : SCHED_NODE_TYPE_VPORT;
+ err = mlx5_esw_qos_vport_enable(vport, type, parent, 0, 0,
+ extack);
+ } else if (vport->qos.sched_node) {
err = esw_qos_vport_update_parent(vport, parent, extack);
+ }
esw_qos_unlock(esw);
return err;
}
@@ -1019,6 +1933,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate,
return mlx5_esw_qos_vport_update_parent(vport, node, extack);
}
+static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node)
+{
+ if (list_empty(&node->children))
+ return true;
+
+ if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ return false;
+
+ node = list_first_entry(&node->children, struct mlx5_esw_sched_node,
+ entry);
+
+ return esw_qos_is_node_empty(node);
+}
+
static int
mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
struct mlx5_esw_sched_node *parent,
@@ -1032,13 +1960,26 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
return -EOPNOTSUPP;
}
- if (!list_empty(&node->children)) {
+ if (!esw_qos_is_node_empty(node)) {
NL_SET_ERR_MSG_MOD(extack,
"Cannot reassign a node that contains rate objects");
return -EOPNOTSUPP;
}
+ if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot attach a node to a parent with TC bandwidth configured");
+ return -EOPNOTSUPP;
+ }
+
new_level = parent ? parent->level + 1 : 2;
+ if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+ /* Increase by one to account for the vports TC scheduling
+ * element.
+ */
+ new_level += 1;
+ }
+
max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth);
if (new_level > max_level) {
NL_SET_ERR_MSG_MOD(extack,
@@ -1049,6 +1990,32 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
return 0;
}
+static int
+esw_qos_tc_arbiter_node_update_parent(struct mlx5_esw_sched_node *node,
+ struct mlx5_esw_sched_node *parent,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_esw_sched_node *curr_parent = node->parent;
+ u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0};
+ struct mlx5_eswitch *esw = node->esw;
+ int err;
+
+ esw_qos_tc_arbiter_get_bw_shares(node, curr_tc_bw);
+ esw_qos_tc_arbiter_scheduling_teardown(node, extack);
+ esw_qos_node_set_parent(node, parent);
+ err = esw_qos_tc_arbiter_scheduling_setup(node, extack);
+ if (err) {
+ esw_qos_node_set_parent(node, curr_parent);
+ if (esw_qos_tc_arbiter_scheduling_setup(node, extack)) {
+ esw_warn(esw->dev, "Node restore QoS failed\n");
+ return err;
+ }
+ }
+ esw_qos_set_tc_arbiter_bw_shares(node, curr_tc_bw, extack);
+
+ return err;
+}
+
static int esw_qos_vports_node_update_parent(struct mlx5_esw_sched_node *node,
struct mlx5_esw_sched_node *parent,
struct netlink_ext_ack *extack)
@@ -1094,7 +2061,13 @@ static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node,
esw_qos_lock(esw);
curr_parent = node->parent;
- err = esw_qos_vports_node_update_parent(node, parent, extack);
+ if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+ err = esw_qos_tc_arbiter_node_update_parent(node, parent,
+ extack);
+ } else {
+ err = esw_qos_vports_node_update_parent(node, parent, extack);
+ }
+
if (err)
goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
index ed40ec8f027e..0a50982b0e27 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
@@ -21,6 +21,14 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void
u64 tx_share, struct netlink_ext_ack *extack);
int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv,
u64 tx_max, struct netlink_ext_ack *extack);
+int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_node,
+ void *priv,
+ u32 *tc_bw,
+ struct netlink_ext_ack *extack);
+int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node,
+ void *priv,
+ u32 *tc_bw,
+ struct netlink_ext_ack *extack);
int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv,
u64 tx_share, struct netlink_ext_ack *extack);
int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 8573d36785f4..d59fdcb29cb8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -212,10 +212,20 @@ struct mlx5_vport {
struct mlx5_vport_info info;
- /* Protected with the E-Switch qos domain lock. */
+ /* Protected with the E-Switch qos domain lock. The Vport QoS can
+ * either be disabled (sched_node is NULL) or in one of three states:
+ * 1. Regular QoS (sched_node is a vport node).
+ * 2. TC QoS enabled on the vport (sched_node is a TC arbiter).
+ * 3. TC QoS enabled on the vport's parent node
+ * (sched_node is a rate limit node).
+ * When TC is enabled in either mode, the vport owns vport TC scheduling
+ * nodes.
+ */
struct {
- /* Vport scheduling element node. */
+ /* Vport scheduling node. */
struct mlx5_esw_sched_node *sched_node;
+ /* Array of vport traffic class scheduling nodes. */
+ struct mlx5_esw_sched_node **sched_nodes;
} qos;
u16 vport;
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 3e0b61202f0c..b3647691060c 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -388,6 +388,17 @@ static const struct file_operations nsim_dev_rate_parent_fops = {
.owner = THIS_MODULE,
};
+static void nsim_dev_tc_bw_debugfs_init(struct dentry *ddir, u32 *tc_bw)
+{
+ int i;
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+ char name[16];
+
+ snprintf(name, sizeof(name), "tc%d_bw", i);
+ debugfs_create_u32(name, 0400, ddir, &tc_bw[i]);
+ }
+}
static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev,
struct nsim_dev_port *nsim_dev_port)
{
@@ -415,6 +426,8 @@ static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev,
nsim_dev_port->ddir,
&nsim_dev_port->parent_name,
&nsim_dev_rate_parent_fops);
+ nsim_dev_tc_bw_debugfs_init(nsim_dev_port->ddir,
+ nsim_dev_port->tc_bw);
}
debugfs_create_symlink("dev", nsim_dev_port->ddir, dev_link_name);
@@ -1172,6 +1185,19 @@ static int nsim_rate_bytes_to_units(char *name, u64 *rate, struct netlink_ext_ac
return 0;
}
+static int nsim_leaf_tc_bw_set(struct devlink_rate *devlink_rate,
+ void *priv, u32 *tc_bw,
+ struct netlink_ext_ack *extack)
+{
+ struct nsim_dev_port *nsim_dev_port = priv;
+ int i;
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++)
+ nsim_dev_port->tc_bw[i] = tc_bw[i];
+
+ return 0;
+}
+
static int nsim_leaf_tx_share_set(struct devlink_rate *devlink_rate, void *priv,
u64 tx_share, struct netlink_ext_ack *extack)
{
@@ -1210,8 +1236,21 @@ struct nsim_rate_node {
char *parent_name;
u16 tx_share;
u16 tx_max;
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX];
};
+static int nsim_node_tc_bw_set(struct devlink_rate *devlink_rate, void *priv,
+ u32 *tc_bw, struct netlink_ext_ack *extack)
+{
+ struct nsim_rate_node *nsim_node = priv;
+ int i;
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++)
+ nsim_node->tc_bw[i] = tc_bw[i];
+
+ return 0;
+}
+
static int nsim_node_tx_share_set(struct devlink_rate *devlink_rate, void *priv,
u64 tx_share, struct netlink_ext_ack *extack)
{
@@ -1264,6 +1303,8 @@ static int nsim_rate_node_new(struct devlink_rate *node, void **priv,
&nsim_node->parent_name,
&nsim_dev_rate_parent_fops);
+ nsim_dev_tc_bw_debugfs_init(nsim_node->ddir, nsim_node->tc_bw);
+
*priv = nsim_node;
return 0;
}
@@ -1340,8 +1381,10 @@ static const struct devlink_ops nsim_dev_devlink_ops = {
.trap_policer_counter_get = nsim_dev_devlink_trap_policer_counter_get,
.rate_leaf_tx_share_set = nsim_leaf_tx_share_set,
.rate_leaf_tx_max_set = nsim_leaf_tx_max_set,
+ .rate_leaf_tc_bw_set = nsim_leaf_tc_bw_set,
.rate_node_tx_share_set = nsim_node_tx_share_set,
.rate_node_tx_max_set = nsim_node_tx_max_set,
+ .rate_node_tc_bw_set = nsim_node_tc_bw_set,
.rate_node_new = nsim_rate_node_new,
.rate_node_del = nsim_rate_node_del,
.rate_leaf_parent_set = nsim_rate_leaf_parent_set,
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 4a0c48c7a384..809dd29fc5fe 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -276,6 +276,7 @@ struct nsim_dev_port {
struct dentry *ddir;
struct dentry *rate_parent;
char *parent_name;
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX];
struct netdevsim *ns;
};
diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c
index 4ff56d9f8f28..adc89e651e27 100644
--- a/drivers/net/vxlan/vxlan_vnifilter.c
+++ b/drivers/net/vxlan/vxlan_vnifilter.c
@@ -971,15 +971,10 @@ static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))
return -EOPNOTSUPP;
- nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) {
- switch (nla_type(attr)) {
- case VXLAN_VNIFILTER_ENTRY:
- err = vxlan_process_vni_filter(vxlan, attr,
- nlh->nlmsg_type, extack);
- break;
- default:
- continue;
- }
+ nlmsg_for_each_attr_type(attr, VXLAN_VNIFILTER_ENTRY, nlh,
+ sizeof(*tmsg), rem) {
+ err = vxlan_process_vni_filter(vxlan, attr, nlh->nlmsg_type,
+ extack);
vnis++;
if (err)
break;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6a42cc7a845a..657d44afc062 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1621,10 +1621,9 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
/* count number of SERVER_THREADS values */
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
- if (nla_type(attr) == NFSD_A_SERVER_THREADS)
- nrpools++;
- }
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+ GENL_HDRLEN, rem)
+ nrpools++;
mutex_lock(&nfsd_mutex);
@@ -1635,12 +1634,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
}
i = 0;
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
- if (nla_type(attr) == NFSD_A_SERVER_THREADS) {
- nthreads[i++] = nla_get_u32(attr);
- if (i >= nrpools)
- break;
- }
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+ GENL_HDRLEN, rem) {
+ nthreads[i++] = nla_get_u32(attr);
+ if (i >= nrpools)
+ break;
}
if (info->attrs[NFSD_A_SERVER_GRACETIME] ||
@@ -1781,14 +1779,12 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info)
for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
nfsd_minorversion(nn, i, NFSD_CLEAR);
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_VERSION_MAX + 1];
u32 major, minor = 0;
bool enabled;
- if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr,
nfsd_version_nl_policy, info->extack) < 0)
continue;
@@ -1939,14 +1935,12 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
* Walk the list of server_socks from userland and move any that match
* back to sv_permsocks
*/
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
const char *xcl_name;
struct sockaddr *sa;
- if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
nfsd_sock_nl_policy, info->extack) < 0)
continue;
@@ -2001,15 +1995,13 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
svc_xprt_destroy_all(serv, net);
/* walk list of addrs again, open any that still don't exist */
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
const char *xcl_name;
struct sockaddr *sa;
int ret;
- if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
nfsd_sock_nl_policy, info->extack) < 0)
continue;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 63517646a497..d0ce5a7e984c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -118,6 +118,8 @@ struct devlink_rate {
u32 tx_priority;
u32 tx_weight;
+
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX];
};
struct devlink_port {
@@ -1486,6 +1488,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate,
+ void *priv, u32 *tc_bw,
+ struct netlink_ext_ack *extack);
int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv,
u64 tx_share, struct netlink_ext_ack *extack);
int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv,
@@ -1494,6 +1499,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate,
+ void *priv, u32 *tc_bw,
+ struct netlink_ext_ack *extack);
int (*rate_node_new)(struct devlink_rate *rate_node, void **priv,
struct netlink_ext_ack *extack);
int (*rate_node_del)(struct devlink_rate *rate_node, void *priv,
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 90a560dc167a..1a8356ca4b78 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -68,6 +68,8 @@
* nlmsg_for_each_msg() loop over all messages
* nlmsg_validate() validate netlink message incl. attrs
* nlmsg_for_each_attr() loop over all attributes
+ * nlmsg_for_each_attr_type() loop over all attributes with the
+ * given type
*
* Misc:
* nlmsg_report() report back to application?
@@ -967,6 +969,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh)
nlmsg_attrlen(nlh, hdrlen), rem)
/**
+ * nlmsg_for_each_attr_type - iterate over a stream of attributes
+ * @pos: loop counter, set to the current attribute
+ * @type: required attribute type for @pos
+ * @nlh: netlink message header
+ * @hdrlen: length of the family specific header
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \
+ nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
+ if (nla_type(pos) == type)
+
+/**
* nlmsg_put - Add a new netlink message to an skb
* @skb: socket buffer to store message in
* @portid: netlink PORTID of requesting application
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index a5ee0f13740a..e72bcc239afd 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -221,6 +221,11 @@ enum devlink_port_flavour {
*/
};
+/* IEEE 802.1Qaz standard supported values. */
+
+#define DEVLINK_RATE_TCS_MAX 8
+#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1)
+
enum devlink_rate_type {
DEVLINK_RATE_TYPE_LEAF,
DEVLINK_RATE_TYPE_NODE,
@@ -629,6 +634,10 @@ enum devlink_attr {
DEVLINK_ATTR_REGION_DIRECT, /* flag */
+ DEVLINK_ATTR_RATE_TC_BWS, /* nested */
+ DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */
+ DEVLINK_ATTR_RATE_TC_BW, /* u32 */
+
/* Add new attributes above here, update the spec in
* Documentation/netlink/specs/devlink.yaml and re-generate
* net/devlink/netlink_gen.c.
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index e340d955cf3b..c50436433c18 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -45,6 +45,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_
[DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15),
};
+const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = {
+ [DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX),
+ [DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, },
+};
+
const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = {
[DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, },
};
@@ -523,7 +528,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_
};
/* DEVLINK_CMD_RATE_SET - do */
-static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
+static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@@ -532,10 +537,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W
[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
};
/* DEVLINK_CMD_RATE_NEW - do */
-static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
+static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@@ -544,6 +550,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W
[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
};
/* DEVLINK_CMD_RATE_DEL - do */
@@ -1191,7 +1198,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_rate_set_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_rate_set_nl_policy,
- .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
@@ -1201,7 +1208,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_rate_new_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_rate_new_nl_policy,
- .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h
index 8f2bd50ddf5e..fb733b5d4ff1 100644
--- a/net/devlink/netlink_gen.h
+++ b/net/devlink/netlink_gen.h
@@ -13,6 +13,7 @@
/* Common nested types */
extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1];
+extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1];
extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1];
/* Ops table for devlink */
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index 8828ffaf6cbc..d39300a9b3d4 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
return ERR_PTR(-EINVAL);
}
+static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
+{
+ struct nlattr *nla_tc_bw;
+ int i;
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+ nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS);
+ if (!nla_tc_bw)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) ||
+ nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i]))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nla_tc_bw);
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, nla_tc_bw);
+ return -EMSGSIZE;
+}
+
static int devlink_nl_rate_fill(struct sk_buff *msg,
struct devlink_rate *devlink_rate,
enum devlink_command cmd, u32 portid, u32 seq,
@@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg,
devlink_rate->parent->name))
goto nla_put_failure;
+ if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
+ goto nla_put_failure;
+
genlmsg_end(msg, hdr);
return 0;
@@ -316,6 +342,87 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
return 0;
}
+static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
+ unsigned long *bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[DEVLINK_ATTR_MAX + 1];
+ u8 tc_index;
+ int err;
+
+ err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest,
+ devlink_dl_rate_tc_bws_nl_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+ DEVLINK_ATTR_RATE_TC_INDEX);
+ return -EINVAL;
+ }
+
+ tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]);
+
+ if (!tb[DEVLINK_ATTR_RATE_TC_BW]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+ DEVLINK_ATTR_RATE_TC_BW);
+ return -EINVAL;
+ }
+
+ if (test_and_set_bit(tc_index, bitmap)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Duplicate traffic class index specified (%u)",
+ tc_index);
+ return -EINVAL;
+ }
+
+ tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]);
+
+ return 0;
+}
+
+static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info)
+{
+ DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {};
+ struct devlink *devlink = devlink_rate->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {};
+ int rem, err = -EOPNOTSUPP, i;
+ struct nlattr *attr;
+
+ nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr,
+ GENL_HDRLEN, rem) {
+ err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+ if (!test_bit(i, bitmap)) {
+ NL_SET_ERR_MSG_FMT(info->extack,
+ "Bandwidth values must be specified for all %u traffic classes",
+ DEVLINK_RATE_TCS_MAX);
+ return -EINVAL;
+ }
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv,
+ tc_bw, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv,
+ tc_bw, info->extack);
+
+ if (err)
+ return err;
+
+ memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw));
+
+ return 0;
+}
+
static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
const struct devlink_ops *ops,
struct genl_info *info)
@@ -388,6 +495,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
return err;
}
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) {
+ err = devlink_nl_rate_tc_bw_set(devlink_rate, info);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -423,6 +536,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
"TX weight set isn't supported for the leafs");
return false;
}
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+ !ops->rate_leaf_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the leafs");
+ return false;
+ }
} else if (type == DEVLINK_RATE_TYPE_NODE) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes");
@@ -449,6 +569,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
"TX weight set isn't supported for the nodes");
return false;
}
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+ !ops->rate_node_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the nodes");
+ return false;
+ }
} else {
WARN(1, "Unknown type of rate object");
return false;
diff --git a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py
new file mode 100755
index 000000000000..820d8a03becc
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py
@@ -0,0 +1,466 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Devlink Rate TC Bandwidth Test Suite
+===================================
+
+This test suite verifies the functionality of devlink-rate traffic class (TC)
+bandwidth distribution in a virtualized environment. The tests validate that
+bandwidth can be properly allocated between different traffic classes and
+that TC mapping works as expected.
+
+Test Environment:
+----------------
+- Creates 1 VF
+- Establishes a bridge connecting the VF representor and the uplink representor
+- Sets up 2 VLAN interfaces on the VF with different VLAN IDs (101, 102)
+- Configures different traffic classes (TC3 and TC4) for each VLAN
+
+Test Cases:
+----------
+1. test_no_tc_mapping_bandwidth:
+ - Verifies that without TC mapping, bandwidth is NOT distributed according to
+ the configured 80/20 split between TC4 and TC3
+ - This test should fail if bandwidth matches the 80/20 split without TC
+ mapping
+ - Expected: Bandwidth should NOT be distributed as 80/20
+
+2. test_tc_mapping_bandwidth:
+ - Configures TC mapping using mqprio qdisc
+ - Verifies that with TC mapping, bandwidth IS distributed according to the
+ configured 80/20 split between TC3 and TC4
+ - Expected: Bandwidth should be distributed as 80/20
+
+Bandwidth Distribution:
+----------------------
+- TC3 (VLAN 101): Configured for 80% of total bandwidth
+- TC4 (VLAN 102): Configured for 20% of total bandwidth
+- Total bandwidth: 1Gbps
+- Tolerance: +-12%
+
+Hardware-Specific Behavior (mlx5):
+--------------------------
+mlx5 hardware enforces traffic class separation by ensuring that each transmit
+queue (SQ) is associated with a single TC. If a packet is sent on a queue that
+doesn't match the expected TC (based on DSCP or VLAN priority and hypervisor-set
+mapping), the hardware moves the queue to the correct TC scheduler to preserve
+traffic isolation.
+
+This behavior means that even without explicit TC-to-queue mapping, bandwidth
+enforcement may still appear to work—because the hardware dynamically adjusts
+the scheduling context. However, this can lead to performance issues in high
+rates and HOL blocking if traffic from different TCs is mixed on the same queue.
+"""
+
+import json
+import os
+import subprocess
+import threading
+import time
+
+from lib.py import ksft_pr, ksft_run, ksft_exit
+from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx
+from lib.py import NetDrvEpEnv, DevlinkFamily
+from lib.py import NlError
+from lib.py import cmd, defer, ethtool, ip
+
+
+class BandwidthValidator:
+ """
+ Validates bandwidth totals and per-TC shares against expected values
+ with a tolerance.
+ """
+
+ def __init__(self):
+ self.tolerance_percent = 12
+ self.expected_total_gbps = 1.0
+ self.total_min_expected = self.min_expected(self.expected_total_gbps)
+ self.total_max_expected = self.max_expected(self.expected_total_gbps)
+ self.tc_expected_percent = {
+ 3: 20.0,
+ 4: 80.0,
+ }
+
+ def min_expected(self, value):
+ """Calculates the minimum acceptable value based on tolerance."""
+ return value - (value * self.tolerance_percent / 100)
+
+ def max_expected(self, value):
+ """Calculates the maximum acceptable value based on tolerance."""
+ return value + (value * self.tolerance_percent / 100)
+
+ def bound(self, expected, value):
+ """Returns True if value is within expected tolerance."""
+ return self.min_expected(expected) <= value <= self.max_expected(expected)
+
+ def tc_bandwidth_bound(self, value, tc_ix):
+ """
+ Returns True if the given bandwidth value is within tolerance
+ for the TC's expected bandwidth.
+ """
+ expected = self.tc_expected_percent[tc_ix]
+ return self.bound(expected, value)
+
+
+def setup_vf(cfg, set_tc_mapping=True):
+ """
+ Sets up a VF on the given network interface.
+
+ Enables SR-IOV and switchdev mode, brings the VF interface up,
+ and optionally configures TC mapping using mqprio.
+ """
+ try:
+ cmd(f"devlink dev eswitch set pci/{cfg.pci} mode switchdev")
+ defer(cmd, f"devlink dev eswitch set pci/{cfg.pci} mode legacy")
+ except Exception as exc:
+ raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc
+ try:
+ cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
+ defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
+ except Exception as exc:
+ raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc
+
+ time.sleep(2)
+ vf_ifc = (os.listdir(
+ f"/sys/class/net/{cfg.ifname}/device/virtfn0/net") or [None])[0]
+ if vf_ifc:
+ ip(f"link set dev {vf_ifc} up")
+ else:
+ raise KsftSkipEx("VF interface not found")
+ if set_tc_mapping:
+ cmd(f"tc qdisc add dev {vf_ifc} root handle 5 mqprio mode dcb hw 1 num_tc 8")
+
+ return vf_ifc
+
+
+def setup_vlans_on_vf(vf_ifc):
+ """
+ Sets up two VLAN interfaces on the given VF, each mapped to a different TC.
+ """
+ vlan_configs = [
+ {"vlan_id": 101, "tc": 3, "ip": "198.51.100.2"},
+ {"vlan_id": 102, "tc": 4, "ip": "198.51.100.10"},
+ ]
+
+ for config in vlan_configs:
+ vlan_dev = f"{vf_ifc}.{config['vlan_id']}"
+ ip(f"link add link {vf_ifc} name {vlan_dev} type vlan id {config['vlan_id']}")
+ ip(f"addr add {config['ip']}/29 dev {vlan_dev}")
+ ip(f"link set dev {vlan_dev} up")
+ ip(f"link set dev {vlan_dev} type vlan egress-qos-map 0:{config['tc']}")
+ ksft_pr(f"Created VLAN {vlan_dev} on {vf_ifc} with tc {config['tc']} and IP {config['ip']}")
+
+
+def get_vf_info(cfg):
+ """
+ Finds the VF representor interface and devlink port index
+ for the given PCI device used in the test environment.
+ """
+ cfg.vf_representor = None
+ cfg.vf_port_index = None
+ out = subprocess.check_output(["devlink", "-j", "port", "show"], encoding="utf-8")
+ ports = json.loads(out)["port"]
+
+ for port_name, props in ports.items():
+ netdev = props.get("netdev")
+
+ if (port_name.startswith(f"pci/{cfg.pci}/") and
+ props.get("vfnum") == 0):
+ cfg.vf_representor = netdev
+ cfg.vf_port_index = int(port_name.split("/")[-1])
+ break
+
+
+def setup_bridge(cfg):
+ """
+ Creates and configures a Linux bridge, with both the uplink
+ and VF representor interfaces attached to it.
+ """
+ bridge_name = f"br_{os.getpid()}"
+ ip(f"link add name {bridge_name} type bridge")
+ defer(cmd, f"ip link del name {bridge_name} type bridge")
+
+ ip(f"link set dev {cfg.ifname} master {bridge_name}")
+
+ rep_name = cfg.vf_representor
+ if rep_name:
+ ip(f"link set dev {rep_name} master {bridge_name}")
+ ip(f"link set dev {rep_name} up")
+ ksft_pr(f"Set representor {rep_name} up and added to bridge")
+ else:
+ raise KsftSkipEx("Could not find representor for the VF")
+
+ ip(f"link set dev {bridge_name} up")
+
+
+def setup_devlink_rate(cfg):
+ """
+ Configures devlink rate tx_max and traffic class bandwidth for the VF.
+ """
+ port_index = cfg.vf_port_index
+ if port_index is None:
+ raise KsftSkipEx("Could not find VF port index")
+ try:
+ cfg.devnl.rate_set({
+ "bus-name": "pci",
+ "dev-name": cfg.pci,
+ "port-index": port_index,
+ "rate-tx-max": 125000000,
+ "rate-tc-bws": [
+ {"rate-tc-index": 0, "rate-tc-bw": 0},
+ {"rate-tc-index": 1, "rate-tc-bw": 0},
+ {"rate-tc-index": 2, "rate-tc-bw": 0},
+ {"rate-tc-index": 3, "rate-tc-bw": 20},
+ {"rate-tc-index": 4, "rate-tc-bw": 80},
+ {"rate-tc-index": 5, "rate-tc-bw": 0},
+ {"rate-tc-index": 6, "rate-tc-bw": 0},
+ {"rate-tc-index": 7, "rate-tc-bw": 0},
+ ]
+ })
+ except NlError as exc:
+ if exc.error == 95: # EOPNOTSUPP
+ raise KsftSkipEx("devlink rate configuration is not supported on the VF") from exc
+ raise KsftFailEx(f"rate_set failed on VF port {port_index}") from exc
+
+
+def setup_remote_server(cfg):
+ """
+ Sets up VLAN interfaces and starts iperf3 servers on the remote side.
+ """
+ remote_dev = cfg.remote_ifname
+ vlan_ids = [101, 102]
+ remote_ips = ["198.51.100.1", "198.51.100.9"]
+
+ for vlan_id, ip_addr in zip(vlan_ids, remote_ips):
+ vlan_dev = f"{remote_dev}.{vlan_id}"
+ cmd(f"ip link add link {remote_dev} name {vlan_dev} "
+ f"type vlan id {vlan_id}", host=cfg.remote)
+ cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote)
+ cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote)
+ cmd(f"iperf3 -s -1 -B {ip_addr}",background=True, host=cfg.remote)
+ defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote)
+
+
+def setup_test_environment(cfg, set_tc_mapping=True):
+ """
+ Sets up the complete test environment including VF creation, VLANs,
+ bridge configuration, devlink rate setup, and the remote server.
+ """
+ vf_ifc = setup_vf(cfg, set_tc_mapping)
+ ksft_pr(f"Created VF interface: {vf_ifc}")
+
+ setup_vlans_on_vf(vf_ifc)
+
+ get_vf_info(cfg)
+ setup_bridge(cfg)
+
+ setup_devlink_rate(cfg)
+ setup_remote_server(cfg)
+ time.sleep(2)
+
+
+def run_iperf_client(server_ip, local_ip, barrier, min_expected_gbps=0.1):
+ """
+ Runs a single iperf3 client instance, binding to the given local IP.
+ Waits on a barrier to synchronize with other threads.
+ """
+ try:
+ barrier.wait(timeout=10)
+ except Exception as exc:
+ raise KsftFailEx("iperf3 barrier wait timed") from exc
+
+ iperf_cmd = ["iperf3", "-c", server_ip, "-B", local_ip, "-J"]
+ result = subprocess.run(iperf_cmd, capture_output=True, text=True,
+ check=True)
+
+ try:
+ output = json.loads(result.stdout)
+ bits_per_second = output["end"]["sum_received"]["bits_per_second"]
+ gbps = bits_per_second / 1e9
+ if gbps < min_expected_gbps:
+ ksft_pr(
+ f"iperf3 bandwidth too low: {gbps:.2f} Gbps "
+ f"(expected ≥ {min_expected_gbps} Gbps)"
+ )
+ return None
+ return gbps
+ except json.JSONDecodeError as exc:
+ ksft_pr(f"Failed to parse iperf3 JSON output: {exc}")
+ return None
+
+
+def run_bandwidth_test():
+ """
+ Launches iperf3 client threads for each VLAN/TC pair and collects results.
+ """
+ def _run_iperf_client_thread(server_ip, local_ip, results, barrier, tc_ix):
+ results[tc_ix] = run_iperf_client(server_ip, local_ip, barrier)
+
+ vf_vlan_data = [
+ # (local_ip, remote_ip, TC)
+ ("198.51.100.2", "198.51.100.1", 3),
+ ("198.51.100.10", "198.51.100.9", 4),
+ ]
+
+ results = {}
+ threads = []
+ start_barrier = threading.Barrier(len(vf_vlan_data))
+
+ for local_ip, remote_ip, tc_ix in vf_vlan_data:
+ thread = threading.Thread(
+ target=_run_iperf_client_thread,
+ args=(remote_ip, local_ip, results, start_barrier, tc_ix)
+ )
+ thread.start()
+ threads.append(thread)
+
+ for thread in threads:
+ thread.join()
+
+ for tc_ix, tc_bw in results.items():
+ if tc_bw is None:
+ raise KsftFailEx("iperf3 client failed; cannot evaluate bandwidth")
+
+ return results
+
+def calculate_bandwidth_percentages(results):
+ """
+ Calculates the percentage of total bandwidth received by TC3 and TC4.
+ """
+ if 3 not in results or 4 not in results:
+ raise KsftFailEx(f"Missing expected TC results in {results}")
+
+ tc3_bw = results[3]
+ tc4_bw = results[4]
+ total_bw = tc3_bw + tc4_bw
+ tc3_percentage = (tc3_bw / total_bw) * 100
+ tc4_percentage = (tc4_bw / total_bw) * 100
+
+ return {
+ 'tc3_bw': tc3_bw,
+ 'tc4_bw': tc4_bw,
+ 'tc3_percentage': tc3_percentage,
+ 'tc4_percentage': tc4_percentage,
+ 'total_bw': total_bw
+ }
+
+
+def print_bandwidth_results(bw_data, test_name):
+ """
+ Prints bandwidth measurements and TC usage summary for a given test.
+ """
+ ksft_pr(f"Bandwidth check results {test_name}:")
+ ksft_pr(f"TC 3: {bw_data['tc3_bw']:.2f} Gbits/sec")
+ ksft_pr(f"TC 4: {bw_data['tc4_bw']:.2f} Gbits/sec")
+ ksft_pr(f"Total bandwidth: {bw_data['total_bw']:.2f} Gbits/sec")
+ ksft_pr(f"TC 3 percentage: {bw_data['tc3_percentage']:.1f}%")
+ ksft_pr(f"TC 4 percentage: {bw_data['tc4_percentage']:.1f}%")
+
+
+def verify_total_bandwidth(bw_data, validator):
+ """
+ Ensures the total measured bandwidth falls within the acceptable tolerance.
+ """
+ total = bw_data['total_bw']
+
+ if validator.bound(validator.expected_total_gbps, total):
+ return
+
+ if total < validator.total_min_expected:
+ raise KsftSkipEx(
+ f"Total bandwidth {total:.2f} Gbps < minimum "
+ f"{validator.total_min_expected:.2f} Gbps; "
+ f"parent tx_max ({validator.expected_total_gbps:.1f} G) "
+ f"not reached, cannot validate share"
+ )
+
+ raise KsftFailEx(
+ f"Total bandwidth {total:.2f} Gbps exceeds allowed ceiling "
+ f"{validator.total_max_expected:.2f} Gbps "
+ f"(VF tx_max set to {validator.expected_total_gbps:.1f} G)"
+ )
+
+
+def check_bandwidth_distribution(bw_data, validator):
+ """
+ Checks whether the measured TC3 and TC4 bandwidth percentages
+ fall within their expected tolerance ranges.
+
+ Returns:
+ bool: True if both TC3 and TC4 percentages are within bounds.
+ """
+ tc3_valid = validator.tc_bandwidth_bound(bw_data['tc3_percentage'], 3)
+ tc4_valid = validator.tc_bandwidth_bound(bw_data['tc4_percentage'], 4)
+
+ return tc3_valid and tc4_valid
+
+
+def run_bandwidth_distribution_test(cfg, set_tc_mapping):
+ """
+ Runs parallel iperf3 tests for both TCs and collects results.
+ """
+ setup_test_environment(cfg, set_tc_mapping)
+ bandwidths = run_bandwidth_test()
+ bw_data = calculate_bandwidth_percentages(bandwidths)
+ test_name = "with TC mapping" if set_tc_mapping else "without TC mapping"
+ print_bandwidth_results(bw_data, test_name)
+
+ verify_total_bandwidth(bw_data, cfg.bw_validator)
+
+ return check_bandwidth_distribution(bw_data, cfg.bw_validator)
+
+
+def test_no_tc_mapping_bandwidth(cfg):
+ """
+ Verifies that bandwidth is not split 80/20 without traffic class mapping.
+ """
+ pass_bw_msg = "Bandwidth is NOT distributed as 80/20 without TC mapping"
+ fail_bw_msg = "Bandwidth matched 80/20 split without TC mapping"
+ is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout
+
+ if run_bandwidth_distribution_test(cfg, set_tc_mapping=False):
+ if is_mlx5:
+ raise KsftXfailEx(fail_bw_msg)
+ raise KsftFailEx(fail_bw_msg)
+ if is_mlx5:
+ raise KsftFailEx("mlx5 behavior changed:" + pass_bw_msg)
+ ksft_pr(pass_bw_msg)
+
+
+def test_tc_mapping_bandwidth(cfg):
+ """
+ Verifies that bandwidth is correctly split 80/20 between TC3 and TC4
+ when traffic class mapping is set.
+ """
+ if run_bandwidth_distribution_test(cfg, set_tc_mapping=True):
+ ksft_pr("Bandwidth is distributed as 80/20 with TC mapping")
+ else:
+ raise KsftFailEx("Bandwidth did not match 80/20 split with TC mapping")
+
+
+def main() -> None:
+ """
+ Main entry point for running the test cases.
+ """
+ with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+ cfg.devnl = DevlinkFamily()
+
+ cfg.pci = os.path.basename(
+ os.path.realpath(f"/sys/class/net/{cfg.ifname}/device")
+ )
+ if not cfg.pci:
+ raise KsftSkipEx("Could not get PCI address of the interface")
+ cfg.require_cmd("iperf3")
+ cfg.require_cmd("iperf3", remote=True)
+
+ cfg.bw_validator = BandwidthValidator()
+
+ cases = [test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth]
+
+ ksft_run(cases=cases, args=(cfg,))
+ ksft_exit()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
index 56ff11074b55..1462a339a74b 100644
--- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
@@ -13,7 +13,7 @@ try:
# Import one by one to avoid pylint false positives
from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \
- NlError, RtnlFamily
+ NlError, RtnlFamily, DevlinkFamily
from net.lib.py import CmdExitFailure
from net.lib.py import bkg, cmd, defer, ethtool, fd_read_timeout, ip, \
rand_port, tool, wait_port_listen
diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py
index 9ed1d8f70524..fce5d9218f1d 100644
--- a/tools/testing/selftests/drivers/net/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py
@@ -12,7 +12,7 @@ try:
# Import one by one to avoid pylint false positives
from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \
- NlError, RtnlFamily
+ NlError, RtnlFamily, DevlinkFamily
from net.lib.py import CmdExitFailure
from net.lib.py import bkg, cmd, defer, ethtool, fd_read_timeout, ip, \
rand_port, tool, wait_port_listen
diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
index b5ea2526f23c..a102803ff74f 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
@@ -608,6 +608,46 @@ rate_attr_parent_check()
check_err $? "Unexpected parent attr value $api_value != $parent"
}
+rate_attr_tc_bw_check()
+{
+ local handle=$1
+ local tc_bw=$2
+ local debug_file=$3
+
+ local tc_bw_str=""
+ for bw in $tc_bw; do
+ local tc=${bw%%:*}
+ local value=${bw##*:}
+ tc_bw_str="$tc_bw_str $tc:$value"
+ done
+ tc_bw_str=${tc_bw_str# }
+
+ rate_attr_set "$handle" tc-bw "$tc_bw_str"
+ check_err $? "Failed to set tc-bw values"
+
+ for bw in $tc_bw; do
+ local tc=${bw%%:*}
+ local value=${bw##*:}
+ local debug_value
+ debug_value=$(cat "$debug_file"/tc"${tc}"_bw)
+ check_err $? "Failed to read tc-bw value from debugfs for tc$tc"
+ [ "$debug_value" == "$value" ]
+ check_err $? "Unexpected tc-bw debug value for tc$tc: $debug_value != $value"
+ done
+
+ for bw in $tc_bw; do
+ local tc=${bw%%:*}
+ local expected_value=${bw##*:}
+ local api_value
+ api_value=$(rate_attr_get "$handle" tc_"$tc")
+ if [ "$api_value" = "null" ]; then
+ api_value=0
+ fi
+ [ "$api_value" == "$expected_value" ]
+ check_err $? "Unexpected tc-bw value for tc$tc: $api_value != $expected_value"
+ done
+}
+
rate_node_add()
{
local handle=$1
@@ -649,6 +689,13 @@ rate_test()
rate=$(($rate+100))
done
+ local tc_bw="0:0 1:40 2:0 3:0 4:0 5:0 6:60 7:0"
+ for r_obj in $leafs
+ do
+ rate_attr_tc_bw_check "$r_obj" "$tc_bw" \
+ "$DEBUGFS_DIR"/ports/"${r_obj##*/}"
+ done
+
local node1_name='group1'
local node1="$DL_HANDLE/$node1_name"
rate_node_add "$node1"
@@ -666,6 +713,12 @@ rate_test()
rate_attr_tx_rate_check $node1 tx_max $node_tx_max \
$DEBUGFS_DIR/rate_nodes/${node1##*/}/tx_max
+
+ local tc_bw="0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0"
+ rate_attr_tc_bw_check $node1 "$tc_bw" \
+ "$DEBUGFS_DIR"/rate_nodes/"${node1##*/}"
+
+
rate_node_del "$node1"
check_err $? "Failed to delete node $node1"
local num_nodes=`rate_nodes_get $DL_HANDLE | wc -w`
diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py
index 8697bd27dc30..02be28dcc089 100644
--- a/tools/testing/selftests/net/lib/py/__init__.py
+++ b/tools/testing/selftests/net/lib/py/__init__.py
@@ -6,4 +6,4 @@ from .netns import NetNS, NetNSEnter
from .nsim import *
from .utils import *
from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily, RtnlAddrFamily
-from .ynl import NetshaperFamily
+from .ynl import NetshaperFamily, DevlinkFamily
diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py
index 6329ae805abf..2b3a61ea3bfa 100644
--- a/tools/testing/selftests/net/lib/py/ynl.py
+++ b/tools/testing/selftests/net/lib/py/ynl.py
@@ -56,3 +56,8 @@ class NetshaperFamily(YnlFamily):
def __init__(self, recv_size=0):
super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(),
schema='', recv_size=recv_size)
+
+class DevlinkFamily(YnlFamily):
+ def __init__(self, recv_size=0):
+ super().__init__((SPEC_PATH / Path('devlink.yaml')).as_posix(),
+ schema='', recv_size=recv_size)