summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShay Drory <shayd@nvidia.com>2022-12-06 20:51:15 +0200
committerJakub Kicinski <kuba@kernel.org>2022-12-07 20:09:18 -0800
commitda65e9ff3bf614d2836e38e1d405c7073e6ba3b7 (patch)
treec90d6feafc4b3aa3244a4f0e6fe75eaa834defec
parent875cd5eeba96091d97ce27e6e9b855fba9d2e0fc (diff)
devlink: Expose port function commands to control RoCE
Expose port function commands to enable / disable RoCE, this is used to control the port RoCE device capabilities. When RoCE is disabled for a function of the port, function cannot create any RoCE specific resources (e.g GID table). It also saves system memory utilization. For example disabling RoCE enable a VF/SF saves 1 Mbytes of system memory per function. Example of a PCI VF port which supports function configuration: Set RoCE of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce enable $ devlink port function set pci/0000:06:00.0/2 roce disable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce disable Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--Documentation/networking/devlink/devlink-port.rst34
-rw-r--r--include/net/devlink.h18
-rw-r--r--include/uapi/linux/devlink.h10
-rw-r--r--net/core/devlink.c113
4 files changed, 174 insertions, 1 deletions
diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 2c637f4aae8e..c3302d23e480 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -110,7 +110,7 @@ devlink ports for both the controllers.
Function configuration
======================
-A user can configure the function attribute before enumerating the PCI
+Users can configure one or more function attributes before enumerating the PCI
function. Usually it means, user should configure function attribute
before a bus specific device for the function is created. However, when
SRIOV is enabled, virtual function devices are created on the PCI bus.
@@ -122,6 +122,9 @@ A user may set the hardware address of the function using
`devlink port function set hw_addr` command. For Ethernet port function
this means a MAC address.
+Users may also set the RoCE capability of the function using
+`devlink port function set roce` command.
+
Function attributes
===================
@@ -162,6 +165,35 @@ device created for the PCI VF/SF.
function:
hw_addr 00:00:00:00:88:88
+RoCE capability setup
+---------------------
+Not all PCI VFs/SFs require RoCE capability.
+
+When RoCE capability is disabled, it saves system memory per PCI VF/SF.
+
+When user disables RoCE capability for a VF/SF, user application cannot send or
+receive any RoCE packets through this VF/SF and RoCE GID table for this PCI
+will be empty.
+
+When RoCE capability is disabled in the device using port function attribute,
+VF/SF driver cannot override it.
+
+- Get RoCE capability of the VF device::
+
+ $ devlink port show pci/0000:06:00.0/2
+ pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+ function:
+ hw_addr 00:00:00:00:00:00 roce enable
+
+- Set RoCE capability of the VF device::
+
+ $ devlink port function set pci/0000:06:00.0/2 roce disable
+
+ $ devlink port show pci/0000:06:00.0/2
+ pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+ function:
+ hw_addr 00:00:00:00:00:00 roce disable
+
Subfunction
============
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 5f6eca5e4a40..ce4c65d2f2e7 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1452,6 +1452,24 @@ struct devlink_ops {
const u8 *hw_addr, int hw_addr_len,
struct netlink_ext_ack *extack);
/**
+ * @port_fn_roce_get: Port function's roce get function.
+ *
+ * Query RoCE state of a function managed by the devlink port.
+ * Return -EOPNOTSUPP if port function RoCE handling is not supported.
+ */
+ int (*port_fn_roce_get)(struct devlink_port *devlink_port,
+ bool *is_enable,
+ struct netlink_ext_ack *extack);
+ /**
+ * @port_fn_roce_set: Port function's roce set function.
+ *
+ * Enable/Disable the RoCE state of a function managed by the devlink
+ * port.
+ * Return -EOPNOTSUPP if port function RoCE handling is not supported.
+ */
+ int (*port_fn_roce_set)(struct devlink_port *devlink_port,
+ bool enable, struct netlink_ext_ack *extack);
+ /**
* port_new() - Add a new port function of a specified flavor
* @devlink: Devlink instance
* @attrs: attributes of the new port
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 70191d96af89..6cc2925bd478 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -658,11 +658,21 @@ enum devlink_resource_unit {
DEVLINK_RESOURCE_UNIT_ENTRY,
};
+enum devlink_port_fn_attr_cap {
+ DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT,
+
+ /* Add new caps above */
+ __DEVLINK_PORT_FN_ATTR_CAPS_MAX,
+};
+
+#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT)
+
enum devlink_port_function_attr {
DEVLINK_PORT_FUNCTION_ATTR_UNSPEC,
DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */
DEVLINK_PORT_FN_ATTR_STATE, /* u8 */
DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */
+ DEVLINK_PORT_FN_ATTR_CAPS, /* bitfield32 */
__DEVLINK_PORT_FUNCTION_ATTR_MAX,
DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 035249c5dd17..8c0ad52431c5 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -195,11 +195,16 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
+#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
+ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)
+
static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
[DEVLINK_PORT_FN_ATTR_STATE] =
NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
DEVLINK_PORT_FN_STATE_ACTIVE),
+ [DEVLINK_PORT_FN_ATTR_CAPS] =
+ NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
};
static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
@@ -680,6 +685,60 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
return 0;
}
+static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
+ u32 cap, bool is_enable)
+{
+ caps->selector |= cap;
+ if (is_enable)
+ caps->value |= cap;
+}
+
+static int devlink_port_fn_roce_fill(const struct devlink_ops *ops,
+ struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!ops->port_fn_roce_get)
+ return 0;
+
+ err = ops->port_fn_roce_get(devlink_port, &is_enable, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_caps_fill(const struct devlink_ops *ops,
+ struct devlink_port *devlink_port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ struct nla_bitfield32 caps = {};
+ int err;
+
+ err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ if (!caps.selector)
+ return 0;
+ err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
+ caps.selector);
+ if (err)
+ return err;
+
+ *msg_updated = true;
+ return 0;
+}
+
static int
devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct genl_info *info,
@@ -1264,6 +1323,35 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
}
static int
+devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+ return ops->port_fn_roce_set(devlink_port, enable, extack);
+}
+
+static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nla_bitfield32 caps;
+ u32 caps_value;
+ int err;
+
+ caps = nla_get_bitfield32(attr);
+ caps_value = caps.value & caps.selector;
+ if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
+ err = devlink_port_fn_roce_set(devlink_port,
+ caps_value & DEVLINK_PORT_FN_CAP_ROCE,
+ extack);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int
devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
struct netlink_ext_ack *extack)
{
@@ -1281,6 +1369,10 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
&msg_updated);
if (err)
goto out;
+ err = devlink_port_fn_caps_fill(ops, port, msg, extack,
+ &msg_updated);
+ if (err)
+ goto out;
err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
out:
if (err || !msg_updated)
@@ -1653,6 +1745,7 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
+ struct nlattr *attr;
if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
!ops->port_function_hw_addr_set) {
@@ -1665,6 +1758,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port,
"Function does not support state setting");
return -EOPNOTSUPP;
}
+ attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+ if (attr) {
+ struct nla_bitfield32 caps;
+
+ caps = nla_get_bitfield32(attr);
+ if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
+ !ops->port_fn_roce_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support RoCE function attribute");
+ return -EOPNOTSUPP;
+ }
+ }
return 0;
}
@@ -1692,6 +1797,14 @@ static int devlink_port_function_set(struct devlink_port *port,
if (err)
return err;
}
+
+ attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+ if (attr) {
+ err = devlink_port_fn_caps_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
/* Keep this as the last function attribute set, so that when
* multiple port function attributes are set along with state,
* Those can be applied first before activating the state.