diff options
43 files changed, 1786 insertions, 250 deletions
diff --git a/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml b/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml index d51da24f3505..ab8867e6939b 100644 --- a/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml +++ b/Documentation/devicetree/bindings/net/nxp,tja11xx.yaml @@ -31,6 +31,22 @@ patternProperties: description: The ID number for the child PHY. Should be +1 of parent PHY. + nxp,rmii-refclk-in: + type: boolean + description: | + The REF_CLK is provided for both transmitted and received data + in RMII mode. This clock signal is provided by the PHY and is + typically derived from an external 25MHz crystal. Alternatively, + a 50MHz clock signal generated by an external oscillator can be + connected to pin REF_CLK. A third option is to connect a 25MHz + clock to pin CLK_IN_OUT. So, the REF_CLK should be configured + as input or output according to the actual circuit connection. + If present, indicates that the REF_CLK will be configured as + interface reference clock input when RMII mode enabled. + If not present, the REF_CLK will be configured as interface + reference clock output when RMII mode enabled. + Only supported on TJA1100 and TJA1101. + required: - reg @@ -44,6 +60,7 @@ examples: tja1101_phy0: ethernet-phy@4 { reg = <0x4>; + nxp,rmii-refclk-in; }; }; - | diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml index b8281d8be940..9ef11913052c 100644 --- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml @@ -55,6 +55,7 @@ properties: compatible: enum: - ti,am654-cpsw-nuss + - ti,j7200-cpswxg-nuss - ti,j721e-cpsw-nuss - ti,am642-cpsw-nuss @@ -110,7 +111,7 @@ properties: const: 0 patternProperties: - port@[1-2]: + "^port@[1-4]$": type: object description: CPSWxG NUSS external ports @@ -119,7 +120,7 @@ properties: properties: reg: minimum: 1 - maximum: 2 + maximum: 4 description: CPSW port number phys: @@ -178,6 +179,19 @@ required: - '#address-cells' - '#size-cells' +allOf: + - if: + not: + properties: + compatible: + contains: + const: ti,j7200-cpswxg-nuss + then: + properties: + ethernet-ports: + patternProperties: + "^port@[3-4]$": false + additionalProperties: false examples: diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index a61eac0c73f8..c78da9ce0ec4 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -26,6 +26,7 @@ place where this information is gathered. ioctl/index iommu media/index + netlink/index sysfs-platform_profile vduse futex2 diff --git a/Documentation/userspace-api/netlink/index.rst b/Documentation/userspace-api/netlink/index.rst new file mode 100644 index 000000000000..b0c21538d97d --- /dev/null +++ b/Documentation/userspace-api/netlink/index.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: BSD-3-Clause + +================ +Netlink Handbook +================ + +Netlink documentation for users. + +.. toctree:: + :maxdepth: 2 + + intro diff --git a/Documentation/userspace-api/netlink/intro.rst b/Documentation/userspace-api/netlink/intro.rst new file mode 100644 index 000000000000..94337f79e077 --- /dev/null +++ b/Documentation/userspace-api/netlink/intro.rst @@ -0,0 +1,643 @@ +.. SPDX-License-Identifier: BSD-3-Clause + +======================= +Introduction to Netlink +======================= + +Netlink is often described as an ioctl() replacement. +It aims to replace fixed-format C structures as supplied +to ioctl() with a format which allows an easy way to add +or extended the arguments. + +To achieve this Netlink uses a minimal fixed-format metadata header +followed by multiple attributes in the TLV (type, length, value) format. + +Unfortunately the protocol has evolved over the years, in an organic +and undocumented fashion, making it hard to coherently explain. +To make the most practical sense this document starts by describing +netlink as it is used today and dives into more "historical" uses +in later sections. + +Opening a socket +================ + +Netlink communication happens over sockets, a socket needs to be +opened first: + +.. code-block:: c + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + +The use of sockets allows for a natural way of exchanging information +in both directions (to and from the kernel). The operations are still +performed synchronously when applications send() the request but +a separate recv() system call is needed to read the reply. + +A very simplified flow of a Netlink "call" will therefore look +something like: + +.. code-block:: c + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + + /* format the request */ + send(fd, &request, sizeof(request)); + n = recv(fd, &response, RSP_BUFFER_SIZE); + /* interpret the response */ + +Netlink also provides natural support for "dumping", i.e. communicating +to user space all objects of a certain type (e.g. dumping all network +interfaces). + +.. code-block:: c + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + + /* format the dump request */ + send(fd, &request, sizeof(request)); + while (1) { + n = recv(fd, &buffer, RSP_BUFFER_SIZE); + /* one recv() call can read multiple messages, hence the loop below */ + for (nl_msg in buffer) { + if (nl_msg.nlmsg_type == NLMSG_DONE) + goto dump_finished; + /* process the object */ + } + } + dump_finished: + +The first two arguments of the socket() call require little explanation - +it is opening a Netlink socket, with all headers provided by the user +(hence NETLINK, RAW). The last argument is the protocol within Netlink. +This field used to identify the subsystem with which the socket will +communicate. + +Classic vs Generic Netlink +-------------------------- + +Initial implementation of Netlink depended on a static allocation +of IDs to subsystems and provided little supporting infrastructure. +Let us refer to those protocols collectively as **Classic Netlink**. +The list of them is defined on top of the ``include/uapi/linux/netlink.h`` +file, they include among others - general networking (NETLINK_ROUTE), +iSCSI (NETLINK_ISCSI), and audit (NETLINK_AUDIT). + +**Generic Netlink** (introduced in 2005) allows for dynamic registration of +subsystems (and subsystem ID allocation), introspection and simplifies +implementing the kernel side of the interface. + +The following section describes how to use Generic Netlink, as the +number of subsystems using Generic Netlink outnumbers the older +protocols by an order of magnitude. There are also no plans for adding +more Classic Netlink protocols to the kernel. +Basic information on how communicating with core networking parts of +the Linux kernel (or another of the 20 subsystems using Classic +Netlink) differs from Generic Netlink is provided later in this document. + +Generic Netlink +=============== + +In addition to the Netlink fixed metadata header each Netlink protocol +defines its own fixed metadata header. (Similarly to how network +headers stack - Ethernet > IP > TCP we have Netlink > Generic N. > Family.) + +A Netlink message always starts with struct nlmsghdr, which is followed +by a protocol-specific header. In case of Generic Netlink the protocol +header is struct genlmsghdr. + +The practical meaning of the fields in case of Generic Netlink is as follows: + +.. code-block:: c + + struct nlmsghdr { + __u32 nlmsg_len; /* Length of message including headers */ + __u16 nlmsg_type; /* Generic Netlink Family (subsystem) ID */ + __u16 nlmsg_flags; /* Flags - request or dump */ + __u32 nlmsg_seq; /* Sequence number */ + __u32 nlmsg_pid; /* Port ID, set to 0 */ + }; + struct genlmsghdr { + __u8 cmd; /* Command, as defined by the Family */ + __u8 version; /* Irrelevant, set to 1 */ + __u16 reserved; /* Reserved, set to 0 */ + }; + /* TLV attributes follow... */ + +In Classic Netlink :c:member:`nlmsghdr.nlmsg_type` used to identify +which operation within the subsystem the message was referring to +(e.g. get information about a netdev). Generic Netlink needs to mux +multiple subsystems in a single protocol so it uses this field to +identify the subsystem, and :c:member:`genlmsghdr.cmd` identifies +the operation instead. (See :ref:`res_fam` for +information on how to find the Family ID of the subsystem of interest.) +Note that the first 16 values (0 - 15) of this field are reserved for +control messages both in Classic Netlink and Generic Netlink. +See :ref:`nl_msg_type` for more details. + +There are 3 usual types of message exchanges on a Netlink socket: + + - performing a single action (``do``); + - dumping information (``dump``); + - getting asynchronous notifications (``multicast``). + +Classic Netlink is very flexible and presumably allows other types +of exchanges to happen, but in practice those are the three that get +used. + +Asynchronous notifications are sent by the kernel and received by +the user sockets which subscribed to them. ``do`` and ``dump`` requests +are initiated by the user. :c:member:`nlmsghdr.nlmsg_flags` should +be set as follows: + + - for ``do``: ``NLM_F_REQUEST | NLM_F_ACK`` + - for ``dump``: ``NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP`` + +:c:member:`nlmsghdr.nlmsg_seq` should be a set to a monotonically +increasing value. The value gets echoed back in responses and doesn't +matter in practice, but setting it to an increasing value for each +message sent is considered good hygiene. The purpose of the field is +matching responses to requests. Asynchronous notifications will have +:c:member:`nlmsghdr.nlmsg_seq` of ``0``. + +:c:member:`nlmsghdr.nlmsg_pid` is the Netlink equivalent of an address. +This field can be set to ``0`` when talking to the kernel. +See :ref:`nlmsg_pid` for the (uncommon) uses of the field. + +The expected use for :c:member:`genlmsghdr.version` was to allow +versioning of the APIs provided by the subsystems. No subsystem to +date made significant use of this field, so setting it to ``1`` seems +like a safe bet. + +.. _nl_msg_type: + +Netlink message types +--------------------- + +As previously mentioned :c:member:`nlmsghdr.nlmsg_type` carries +protocol specific values but the first 16 identifiers are reserved +(first subsystem specific message type should be equal to +``NLMSG_MIN_TYPE`` which is ``0x10``). + +There are only 4 Netlink control messages defined: + + - ``NLMSG_NOOP`` - ignore the message, not used in practice; + - ``NLMSG_ERROR`` - carries the return code of an operation; + - ``NLMSG_DONE`` - marks the end of a dump; + - ``NLMSG_OVERRUN`` - socket buffer has overflown, not used to date. + +``NLMSG_ERROR`` and ``NLMSG_DONE`` are of practical importance. +They carry return codes for operations. Note that unless +the ``NLM_F_ACK`` flag is set on the request Netlink will not respond +with ``NLMSG_ERROR`` if there is no error. To avoid having to special-case +this quirk it is recommended to always set ``NLM_F_ACK``. + +The format of ``NLMSG_ERROR`` is described by struct nlmsgerr:: + + ---------------------------------------------- + | struct nlmsghdr - response header | + ---------------------------------------------- + | int error | + ---------------------------------------------- + | struct nlmsghdr - original request header | + ---------------------------------------------- + | ** optionally (1) payload of the request | + ---------------------------------------------- + | ** optionally (2) extended ACK | + ---------------------------------------------- + +There are two instances of struct nlmsghdr here, first of the response +and second of the request. ``NLMSG_ERROR`` carries the information about +the request which led to the error. This could be useful when trying +to match requests to responses or re-parse the request to dump it into +logs. + +The payload of the request is not echoed in messages reporting success +(``error == 0``) or if ``NETLINK_CAP_ACK`` setsockopt() was set. +The latter is common +and perhaps recommended as having to read a copy of every request back +from the kernel is rather wasteful. The absence of request payload +is indicated by ``NLM_F_CAPPED`` in :c:member:`nlmsghdr.nlmsg_flags`. + +The second optional element of ``NLMSG_ERROR`` are the extended ACK +attributes. See :ref:`ext_ack` for more details. The presence +of extended ACK is indicated by ``NLM_F_ACK_TLVS`` in +:c:member:`nlmsghdr.nlmsg_flags`. + +``NLMSG_DONE`` is simpler, the request is never echoed but the extended +ACK attributes may be present:: + + ---------------------------------------------- + | struct nlmsghdr - response header | + ---------------------------------------------- + | int error | + ---------------------------------------------- + | ** optionally extended ACK | + ---------------------------------------------- + +.. _res_fam: + +Resolving the Family ID +----------------------- + +This section explains how to find the Family ID of a subsystem. +It also serves as an example of Generic Netlink communication. + +Generic Netlink is itself a subsystem exposed via the Generic Netlink API. +To avoid a circular dependency Generic Netlink has a statically allocated +Family ID (``GENL_ID_CTRL`` which is equal to ``NLMSG_MIN_TYPE``). +The Generic Netlink family implements a command used to find out information +about other families (``CTRL_CMD_GETFAMILY``). + +To get information about the Generic Netlink family named for example +``"test1"`` we need to send a message on the previously opened Generic Netlink +socket. The message should target the Generic Netlink Family (1), be a +``do`` (2) call to ``CTRL_CMD_GETFAMILY`` (3). A ``dump`` version of this +call would make the kernel respond with information about *all* the families +it knows about. Last but not least the name of the family in question has +to be specified (4) as an attribute with the appropriate type:: + + struct nlmsghdr: + __u32 nlmsg_len: 32 + __u16 nlmsg_type: GENL_ID_CTRL // (1) + __u16 nlmsg_flags: NLM_F_REQUEST | NLM_F_ACK // (2) + __u32 nlmsg_seq: 1 + __u32 nlmsg_pid: 0 + + struct genlmsghdr: + __u8 cmd: CTRL_CMD_GETFAMILY // (3) + __u8 version: 2 /* or 1, doesn't matter */ + __u16 reserved: 0 + + struct nlattr: // (4) + __u16 nla_len: 10 + __u16 nla_type: CTRL_ATTR_FAMILY_NAME + char data: test1\0 + + (padding:) + char data: \0\0 + +The length fields in Netlink (:c:member:`nlmsghdr.nlmsg_len` +and :c:member:`nlattr.nla_len`) always *include* the header. +Attribute headers in netlink must be aligned to 4 bytes from the start +of the message, hence the extra ``\0\0`` after ``CTRL_ATTR_FAMILY_NAME``. +The attribute lengths *exclude* the padding. + +If the family is found kernel will reply with two messages, the response +with all the information about the family:: + + /* Message #1 - reply */ + struct nlmsghdr: + __u32 nlmsg_len: 136 + __u16 nlmsg_type: GENL_ID_CTRL + __u16 nlmsg_flags: 0 + __u32 nlmsg_seq: 1 /* echoed from our request */ + __u32 nlmsg_pid: 5831 /* The PID of our user space process */ + + struct genlmsghdr: + __u8 cmd: CTRL_CMD_GETFAMILY + __u8 version: 2 + __u16 reserved: 0 + + struct nlattr: + __u16 nla_len: 10 + __u16 nla_type: CTRL_ATTR_FAMILY_NAME + char data: test1\0 + + (padding:) + data: \0\0 + + struct nlattr: + __u16 nla_len: 6 + __u16 nla_type: CTRL_ATTR_FAMILY_ID + __u16: 123 /* The Family ID we are after */ + + (padding:) + char data: \0\0 + + struct nlattr: + __u16 nla_len: 9 + __u16 nla_type: CTRL_ATTR_FAMILY_VERSION + __u16: 1 + + /* ... etc, more attributes will follow. */ + +And the error code (success) since ``NLM_F_ACK`` had been set on the request:: + + /* Message #2 - the ACK */ + struct nlmsghdr: + __u32 nlmsg_len: 36 + __u16 nlmsg_type: NLMSG_ERROR + __u16 nlmsg_flags: NLM_F_CAPPED /* There won't be a payload */ + __u32 nlmsg_seq: 1 /* echoed from our request */ + __u32 nlmsg_pid: 5831 /* The PID of our user space process */ + + int error: 0 + + struct nlmsghdr: /* Copy of the request header as we sent it */ + __u32 nlmsg_len: 32 + __u16 nlmsg_type: GENL_ID_CTRL + __u16 nlmsg_flags: NLM_F_REQUEST | NLM_F_ACK + __u32 nlmsg_seq: 1 + __u32 nlmsg_pid: 0 + +The order of attributes (struct nlattr) is not guaranteed so the user +has to walk the attributes and parse them. + +Note that Generic Netlink sockets are not associated or bound to a single +family. A socket can be used to exchange messages with many different +families, selecting the recipient family on message-by-message basis using +the :c:member:`nlmsghdr.nlmsg_type` field. + +.. _ext_ack: + +Extended ACK +------------ + +Extended ACK controls reporting of additional error/warning TLVs +in ``NLMSG_ERROR`` and ``NLMSG_DONE`` messages. To maintain backward +compatibility this feature has to be explicitly enabled by setting +the ``NETLINK_EXT_ACK`` setsockopt() to ``1``. + +Types of extended ack attributes are defined in enum nlmsgerr_attrs. +The two most commonly used attributes are ``NLMSGERR_ATTR_MSG`` +and ``NLMSGERR_ATTR_OFFS``. + +``NLMSGERR_ATTR_MSG`` carries a message in English describing +the encountered problem. These messages are far more detailed +than what can be expressed thru standard UNIX error codes. + +``NLMSGERR_ATTR_OFFS`` points to the attribute which caused the problem. + +Extended ACKs can be reported on errors as well as in case of success. +The latter should be treated as a warning. + +Extended ACKs greatly improve the usability of Netlink and should +always be enabled, appropriately parsed and reported to the user. + +Advanced topics +=============== + +Dump consistency +---------------- + +Some of the data structures kernel uses for storing objects make +it hard to provide an atomic snapshot of all the objects in a dump +(without impacting the fast-paths updating them). + +Kernel may set the ``NLM_F_DUMP_INTR`` flag on any message in a dump +(including the ``NLMSG_DONE`` message) if the dump was interrupted and +may be inconsistent (e.g. missing objects). User space should retry +the dump if it sees the flag set. + +Introspection +------------- + +The basic introspection abilities are enabled by access to the Family +object as reported in :ref:`res_fam`. User can query information about +the Generic Netlink family, including which operations are supported +by the kernel and what attributes the kernel understands. +Family information includes the highest ID of an attribute kernel can parse, +a separate command (``CTRL_CMD_GETPOLICY``) provides detailed information +about supported attributes, including ranges of values the kernel accepts. + +Querying family information is useful in cases when user space needs +to make sure that the kernel has support for a feature before issuing +a request. + +.. _nlmsg_pid: + +nlmsg_pid +--------- + +:c:member:`nlmsghdr.nlmsg_pid` is the Netlink equivalent of an address. +It is referred to as Port ID, sometimes Process ID because for historical +reasons if the application does not select (bind() to) an explicit Port ID +kernel will automatically assign it the ID equal to its Process ID +(as reported by the getpid() system call). + +Similarly to the bind() semantics of the TCP/IP network protocols the value +of zero means "assign automatically", hence it is common for applications +to leave the :c:member:`nlmsghdr.nlmsg_pid` field initialized to ``0``. + +The field is still used today in rare cases when kernel needs to send +a unicast notification. User space application can use bind() to associate +its socket with a specific PID, it then communicates its PID to the kernel. +This way the kernel can reach the specific user space process. + +This sort of communication is utilized in UMH (User Mode Helper)-like +scenarios when kernel needs to trigger user space processing or ask user +space for a policy decision. + +Multicast notifications +----------------------- + +One of the strengths of Netlink is the ability to send event notifications +to user space. This is a unidirectional form of communication (kernel -> +user) and does not involve any control messages like ``NLMSG_ERROR`` or +``NLMSG_DONE``. + +For example the Generic Netlink family itself defines a set of multicast +notifications about registered families. When a new family is added the +sockets subscribed to the notifications will get the following message:: + + struct nlmsghdr: + __u32 nlmsg_len: 136 + __u16 nlmsg_type: GENL_ID_CTRL + __u16 nlmsg_flags: 0 + __u32 nlmsg_seq: 0 + __u32 nlmsg_pid: 0 + + struct genlmsghdr: + __u8 cmd: CTRL_CMD_NEWFAMILY + __u8 version: 2 + __u16 reserved: 0 + + struct nlattr: + __u16 nla_len: 10 + __u16 nla_type: CTRL_ATTR_FAMILY_NAME + char data: test1\0 + + (padding:) + data: \0\0 + + struct nlattr: + __u16 nla_len: 6 + __u16 nla_type: CTRL_ATTR_FAMILY_ID + __u16: 123 /* The Family ID we are after */ + + (padding:) + char data: \0\0 + + struct nlattr: + __u16 nla_len: 9 + __u16 nla_type: CTRL_ATTR_FAMILY_VERSION + __u16: 1 + + /* ... etc, more attributes will follow. */ + +The notification contains the same information as the response +to the ``CTRL_CMD_GETFAMILY`` request. + +The Netlink headers of the notification are mostly 0 and irrelevant. +The :c:member:`nlmsghdr.nlmsg_seq` may be either zero or a monotonically +increasing notification sequence number maintained by the family. + +To receive notifications the user socket must subscribe to the relevant +notification group. Much like the Family ID, the Group ID for a given +multicast group is dynamic and can be found inside the Family information. +The ``CTRL_ATTR_MCAST_GROUPS`` attribute contains nests with names +(``CTRL_ATTR_MCAST_GRP_NAME``) and IDs (``CTRL_ATTR_MCAST_GRP_ID``) of +the groups family. + +Once the Group ID is known a setsockopt() call adds the socket to the group: + +.. code-block:: c + + unsigned int group_id; + + /* .. find the group ID... */ + + setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, + &group_id, sizeof(group_id)); + +The socket will now receive notifications. + +It is recommended to use separate sockets for receiving notifications +and sending requests to the kernel. The asynchronous nature of notifications +means that they may get mixed in with the responses making the message +handling much harder. + +Buffer sizing +------------- + +Netlink sockets are datagram sockets rather than stream sockets, +meaning that each message must be received in its entirety by a single +recv()/recvmsg() system call. If the buffer provided by the user is too +short, the message will be truncated and the ``MSG_TRUNC`` flag set +in struct msghdr (struct msghdr is the second argument +of the recvmsg() system call, *not* a Netlink header). + +Upon truncation the remaining part of the message is discarded. + +Netlink expects that the user buffer will be at least 8kB or a page +size of the CPU architecture, whichever is bigger. Particular Netlink +families may, however, require a larger buffer. 32kB buffer is recommended +for most efficient handling of dumps (larger buffer fits more dumped +objects and therefore fewer recvmsg() calls are needed). + +Classic Netlink +=============== + +The main differences between Classic and Generic Netlink are the dynamic +allocation of subsystem identifiers and availability of introspection. +In theory the protocol does not differ significantly, however, in practice +Classic Netlink experimented with concepts which were abandoned in Generic +Netlink (really, they usually only found use in a small corner of a single +subsystem). This section is meant as an explainer of a few of such concepts, +with the explicit goal of giving the Generic Netlink +users the confidence to ignore them when reading the uAPI headers. + +Most of the concepts and examples here refer to the ``NETLINK_ROUTE`` family, +which covers much of the configuration of the Linux networking stack. +Real documentation of that family, deserves a chapter (or a book) of its own. + +Families +-------- + +Netlink refers to subsystems as families. This is a remnant of using +sockets and the concept of protocol families, which are part of message +demultiplexing in ``NETLINK_ROUTE``. + +Sadly every layer of encapsulation likes to refer to whatever it's carrying +as "families" making the term very confusing: + + 1. AF_NETLINK is a bona fide socket protocol family + 2. AF_NETLINK's documentation refers to what comes after its own + header (struct nlmsghdr) in a message as a "Family Header" + 3. Generic Netlink is a family for AF_NETLINK (struct genlmsghdr follows + struct nlmsghdr), yet it also calls its users "Families". + +Note that the Generic Netlink Family IDs are in a different "ID space" +and overlap with Classic Netlink protocol numbers (e.g. ``NETLINK_CRYPTO`` +has the Classic Netlink protocol ID of 21 which Generic Netlink will +happily allocate to one of its families as well). + +Strict checking +--------------- + +The ``NETLINK_GET_STRICT_CHK`` socket option enables strict input checking +in ``NETLINK_ROUTE``. It was needed because historically kernel did not +validate the fields of structures it didn't process. This made it impossible +to start using those fields later without risking regressions in applications +which initialized them incorrectly or not at all. + +``NETLINK_GET_STRICT_CHK`` declares that the application is initializing +all fields correctly. It also opts into validating that message does not +contain trailing data and requests that kernel rejects attributes with +type higher than largest attribute type known to the kernel. + +``NETLINK_GET_STRICT_CHK`` is not used outside of ``NETLINK_ROUTE``. + +Unknown attributes +------------------ + +Historically Netlink ignored all unknown attributes. The thinking was that +it would free the application from having to probe what kernel supports. +The application could make a request to change the state and check which +parts of the request "stuck". + +This is no longer the case for new Generic Netlink families and those opting +in to strict checking. See enum netlink_validation for validation types +performed. + +Fixed metadata and structures +----------------------------- + +Classic Netlink made liberal use of fixed-format structures within +the messages. Messages would commonly have a structure with +a considerable number of fields after struct nlmsghdr. It was also +common to put structures with multiple members inside attributes, +without breaking each member into an attribute of its own. + +This has caused problems with validation and extensibility and +therefore using binary structures is actively discouraged for new +attributes. + +Request types +------------- + +``NETLINK_ROUTE`` categorized requests into 4 types ``NEW``, ``DEL``, ``GET``, +and ``SET``. Each object can handle all or some of those requests +(objects being netdevs, routes, addresses, qdiscs etc.) Request type +is defined by the 2 lowest bits of the message type, so commands for +new objects would always be allocated with a stride of 4. + +Each object would also have it's own fixed metadata shared by all request +types (e.g. struct ifinfomsg for netdev requests, struct ifaddrmsg for address +requests, struct tcmsg for qdisc requests). + +Even though other protocols and Generic Netlink commands often use +the same verbs in their message names (``GET``, ``SET``) the concept +of request types did not find wider adoption. + +Message flags +------------- + +The earlier section has already covered the basic request flags +(``NLM_F_REQUEST``, ``NLM_F_ACK``, ``NLM_F_DUMP``) and the ``NLMSG_ERROR`` / +``NLMSG_DONE`` flags (``NLM_F_CAPPED``, ``NLM_F_ACK_TLVS``). +Dump flags were also mentioned (``NLM_F_MULTI``, ``NLM_F_DUMP_INTR``). + +Those are the main flags of note, with a small exception (of ``ieee802154``) +Generic Netlink does not make use of other flags. If the protocol needs +to communicate special constraints for a request it should use +an attribute, not the flags in struct nlmsghdr. + +Classic Netlink, however, defined various flags for its ``GET``, ``NEW`` +and ``DEL`` requests. Since request types have not been generalized +the request type specific flags should not be used either. + +uAPI reference +============== + +.. kernel-doc:: include/uapi/linux/netlink.h diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index aadb0bd7c24f..ee19ed96f284 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -445,6 +445,9 @@ static int felix_tag_8021q_setup(struct dsa_switch *ds) if (err) return err; + dsa_switch_for_each_cpu_port(dp, ds) + ocelot_port_setup_dsa_8021q_cpu(ocelot, dp->index); + dsa_switch_for_each_user_port(dp, ds) ocelot_port_assign_dsa_8021q_cpu(ocelot, dp->index, dp->cpu_dp->index); @@ -493,6 +496,9 @@ static void felix_tag_8021q_teardown(struct dsa_switch *ds) dsa_switch_for_each_user_port(dp, ds) ocelot_port_unassign_dsa_8021q_cpu(ocelot, dp->index); + dsa_switch_for_each_cpu_port(dp, ds) + ocelot_port_teardown_dsa_8021q_cpu(ocelot, dp->index); + dsa_tag_8021q_unregister(ds); } diff --git a/drivers/net/ethernet/faraday/ftmac100.c b/drivers/net/ethernet/faraday/ftmac100.c index 8a341e2d5833..2e6524009b19 100644 --- a/drivers/net/ethernet/faraday/ftmac100.c +++ b/drivers/net/ethernet/faraday/ftmac100.c @@ -1075,6 +1075,7 @@ static int ftmac100_probe(struct platform_device *pdev) SET_NETDEV_DEV(netdev, &pdev->dev); netdev->ethtool_ops = &ftmac100_ethtool_ops; netdev->netdev_ops = &ftmac100_netdev_ops; + netdev->max_mtu = MAX_PKT_SIZE; platform_set_drvdata(pdev, netdev); diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h index 2f84d0fb4094..e5a4381a88b3 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera.h +++ b/drivers/net/ethernet/marvell/prestera/prestera.h @@ -367,6 +367,8 @@ int prestera_port_learning_set(struct prestera_port *port, bool learn_enable); int prestera_port_uc_flood_set(struct prestera_port *port, bool flood); int prestera_port_mc_flood_set(struct prestera_port *port, bool flood); +int prestera_port_br_locked_set(struct prestera_port *port, bool br_locked); + int prestera_port_pvid_set(struct prestera_port *port, u16 vid); bool prestera_netdev_check(const struct net_device *dev); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c index e0e9ae34ceea..1a68a888d587 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c @@ -101,6 +101,7 @@ enum { PRESTERA_CMD_PORT_ATTR_LEARNING = 7, PRESTERA_CMD_PORT_ATTR_FLOOD = 8, PRESTERA_CMD_PORT_ATTR_CAPABILITY = 9, + PRESTERA_CMD_PORT_ATTR_LOCKED = 10, PRESTERA_CMD_PORT_ATTR_PHY_MODE = 12, PRESTERA_CMD_PORT_ATTR_TYPE = 13, PRESTERA_CMD_PORT_ATTR_STATS = 17, @@ -285,6 +286,7 @@ union prestera_msg_port_param { u8 duplex; u8 fec; u8 fc; + u8 br_locked; union { struct { u8 admin; @@ -1640,6 +1642,22 @@ int prestera_hw_port_mc_flood_set(const struct prestera_port *port, bool flood) &req.cmd, sizeof(req)); } +int prestera_hw_port_br_locked_set(const struct prestera_port *port, + bool br_locked) +{ + struct prestera_msg_port_attr_req req = { + .attr = __cpu_to_le32(PRESTERA_CMD_PORT_ATTR_LOCKED), + .port = __cpu_to_le32(port->hw_id), + .dev = __cpu_to_le32(port->dev_id), + .param = { + .br_locked = br_locked, + } + }; + + return prestera_cmd(port->sw, PRESTERA_CMD_TYPE_PORT_ATTR_SET, + &req.cmd, sizeof(req)); +} + int prestera_hw_vlan_create(struct prestera_switch *sw, u16 vid) { struct prestera_msg_vlan_req req = { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_hw.h index 56e043146dd2..4aca43e72a05 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_hw.h +++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.h @@ -183,6 +183,8 @@ int prestera_hw_port_speed_get(const struct prestera_port *port, u32 *speed); int prestera_hw_port_learning_set(struct prestera_port *port, bool enable); int prestera_hw_port_uc_flood_set(const struct prestera_port *port, bool flood); int prestera_hw_port_mc_flood_set(const struct prestera_port *port, bool flood); +int prestera_hw_port_br_locked_set(const struct prestera_port *port, + bool br_locked); int prestera_hw_port_accept_frm_type(struct prestera_port *port, enum prestera_accept_frm_type type); /* Vlan API */ diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 3489b80ae0d6..3956d6d5df3c 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -51,6 +51,11 @@ int prestera_port_mc_flood_set(struct prestera_port *port, bool flood) return prestera_hw_port_mc_flood_set(port, flood); } +int prestera_port_br_locked_set(struct prestera_port *port, bool br_locked) +{ + return prestera_hw_port_br_locked_set(port, br_locked); +} + int prestera_port_pvid_set(struct prestera_port *port, u16 vid) { enum prestera_accept_frm_type frm_type; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 71cde97d85c8..e548cd32582e 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -143,6 +143,7 @@ prestera_br_port_flags_reset(struct prestera_bridge_port *br_port, prestera_port_uc_flood_set(port, false); prestera_port_mc_flood_set(port, false); prestera_port_learning_set(port, false); + prestera_port_br_locked_set(port, false); } static int prestera_br_port_flags_set(struct prestera_bridge_port *br_port, @@ -162,6 +163,11 @@ static int prestera_br_port_flags_set(struct prestera_bridge_port *br_port, if (err) goto err_out; + err = prestera_port_br_locked_set(port, + br_port->flags & BR_PORT_LOCKED); + if (err) + goto err_out; + return 0; err_out: @@ -1163,7 +1169,7 @@ static int prestera_port_obj_attr_set(struct net_device *dev, const void *ctx, break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: if (attr->u.brport_flags.mask & - ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) + ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_PORT_LOCKED)) err = -EINVAL; break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 75553eb2c7f2..450dcd9951bb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -70,6 +70,8 @@ struct mlxsw_core { struct workqueue_struct *emad_wq; struct list_head rx_listener_list; struct list_head event_listener_list; + struct list_head irq_event_handler_list; + struct mutex irq_event_handler_lock; /* Locks access to handlers list */ struct { atomic64_t tid; struct list_head trans_list; @@ -2090,6 +2092,18 @@ static void mlxsw_core_health_fini(struct mlxsw_core *mlxsw_core) devlink_health_reporter_destroy(mlxsw_core->health.fw_fatal); } +static void mlxsw_core_irq_event_handler_init(struct mlxsw_core *mlxsw_core) +{ + INIT_LIST_HEAD(&mlxsw_core->irq_event_handler_list); + mutex_init(&mlxsw_core->irq_event_handler_lock); +} + +static void mlxsw_core_irq_event_handler_fini(struct mlxsw_core *mlxsw_core) +{ + mutex_destroy(&mlxsw_core->irq_event_handler_lock); + WARN_ON(!list_empty(&mlxsw_core->irq_event_handler_list)); +} + static int __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info, const struct mlxsw_bus *mlxsw_bus, @@ -2125,6 +2139,7 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info, mlxsw_core->bus = mlxsw_bus; mlxsw_core->bus_priv = bus_priv; mlxsw_core->bus_info = mlxsw_bus_info; + mlxsw_core_irq_event_handler_init(mlxsw_core); err = mlxsw_bus->init(bus_priv, mlxsw_core, mlxsw_driver->profile, &mlxsw_core->res); @@ -2233,6 +2248,7 @@ err_ports_init: err_register_resources: mlxsw_bus->fini(bus_priv); err_bus_init: + mlxsw_core_irq_event_handler_fini(mlxsw_core); if (!reload) { devl_unlock(devlink); devlink_free(devlink); @@ -2302,6 +2318,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, if (!reload) devl_resources_unregister(devlink); mlxsw_core->bus->fini(mlxsw_core->bus_priv); + mlxsw_core_irq_event_handler_fini(mlxsw_core); if (!reload) { devl_unlock(devlink); devlink_free(devlink); @@ -2772,6 +2789,57 @@ int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list) } EXPORT_SYMBOL(mlxsw_reg_trans_bulk_wait); +struct mlxsw_core_irq_event_handler_item { + struct list_head list; + void (*cb)(struct mlxsw_core *mlxsw_core); +}; + +int mlxsw_core_irq_event_handler_register(struct mlxsw_core *mlxsw_core, + mlxsw_irq_event_cb_t cb) +{ + struct mlxsw_core_irq_event_handler_item *item; + + item = kzalloc(sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + item->cb = cb; + mutex_lock(&mlxsw_core->irq_event_handler_lock); + list_add_tail(&item->list, &mlxsw_core->irq_event_handler_list); + mutex_unlock(&mlxsw_core->irq_event_handler_lock); + return 0; +} +EXPORT_SYMBOL(mlxsw_core_irq_event_handler_register); + +void mlxsw_core_irq_event_handler_unregister(struct mlxsw_core *mlxsw_core, + mlxsw_irq_event_cb_t cb) +{ + struct mlxsw_core_irq_event_handler_item *item, *tmp; + + mutex_lock(&mlxsw_core->irq_event_handler_lock); + list_for_each_entry_safe(item, tmp, + &mlxsw_core->irq_event_handler_list, list) { + if (item->cb == cb) { + list_del(&item->list); + kfree(item); + } + } + mutex_unlock(&mlxsw_core->irq_event_handler_lock); +} +EXPORT_SYMBOL(mlxsw_core_irq_event_handler_unregister); + +void mlxsw_core_irq_event_handlers_call(struct mlxsw_core *mlxsw_core) +{ + struct mlxsw_core_irq_event_handler_item *item; + + mutex_lock(&mlxsw_core->irq_event_handler_lock); + list_for_each_entry(item, &mlxsw_core->irq_event_handler_list, list) { + if (item->cb) + item->cb(mlxsw_core); + } + mutex_unlock(&mlxsw_core->irq_event_handler_lock); +} +EXPORT_SYMBOL(mlxsw_core_irq_event_handlers_call); + static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core, const struct mlxsw_reg_info *reg, char *payload, diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 02d9cc2ef0c8..c7c0b3cefd8d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -215,6 +215,14 @@ int mlxsw_reg_trans_write(struct mlxsw_core *mlxsw_core, mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv); int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list); +typedef void mlxsw_irq_event_cb_t(struct mlxsw_core *mlxsw_core); + +int mlxsw_core_irq_event_handler_register(struct mlxsw_core *mlxsw_core, + mlxsw_irq_event_cb_t cb); +void mlxsw_core_irq_event_handler_unregister(struct mlxsw_core *mlxsw_core, + mlxsw_irq_event_cb_t cb); +void mlxsw_core_irq_event_handlers_call(struct mlxsw_core *mlxsw_core); + int mlxsw_reg_query(struct mlxsw_core *mlxsw_core, const struct mlxsw_reg_info *reg, char *payload); int mlxsw_reg_write(struct mlxsw_core *mlxsw_core, diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c index ca59f0b946da..83d2dc91ba2c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c @@ -785,6 +785,21 @@ static int mlxsw_linecard_status_get_and_process(struct mlxsw_core *mlxsw_core, return mlxsw_linecard_status_process(linecards, linecard, mddq_pl); } +static void mlxsw_linecards_irq_event_handler(struct mlxsw_core *mlxsw_core) +{ + struct mlxsw_linecards *linecards = mlxsw_core_linecards(mlxsw_core); + int i; + + /* Handle change of line card active state. */ + for (i = 0; i < linecards->count; i++) { + struct mlxsw_linecard *linecard = mlxsw_linecard_get(linecards, + i + 1); + + mlxsw_linecard_status_get_and_process(mlxsw_core, linecards, + linecard); + } +} + static const char * const mlxsw_linecard_status_event_type_name[] = { [MLXSW_LINECARD_STATUS_EVENT_TYPE_PROVISION] = "provision", [MLXSW_LINECARD_STATUS_EVENT_TYPE_UNPROVISION] = "unprovision", @@ -1238,7 +1253,6 @@ static int mlxsw_linecard_init(struct mlxsw_core *mlxsw_core, { struct devlink_linecard *devlink_linecard; struct mlxsw_linecard *linecard; - int err; linecard = mlxsw_linecard_get(linecards, slot_index); linecard->slot_index = slot_index; @@ -1248,17 +1262,45 @@ static int mlxsw_linecard_init(struct mlxsw_core *mlxsw_core, devlink_linecard = devlink_linecard_create(priv_to_devlink(mlxsw_core), slot_index, &mlxsw_linecard_ops, linecard); - if (IS_ERR(devlink_linecard)) { - err = PTR_ERR(devlink_linecard); - goto err_devlink_linecard_create; - } + if (IS_ERR(devlink_linecard)) + return PTR_ERR(devlink_linecard); + linecard->devlink_linecard = devlink_linecard; INIT_DELAYED_WORK(&linecard->status_event_to_dw, &mlxsw_linecard_status_event_to_work); + return 0; +} + +static void mlxsw_linecard_fini(struct mlxsw_core *mlxsw_core, + struct mlxsw_linecards *linecards, + u8 slot_index) +{ + struct mlxsw_linecard *linecard; + + linecard = mlxsw_linecard_get(linecards, slot_index); + cancel_delayed_work_sync(&linecard->status_event_to_dw); + /* Make sure all scheduled events are processed */ + mlxsw_core_flush_owq(); + if (linecard->active) + mlxsw_linecard_active_clear(linecard); + mlxsw_linecard_bdev_del(linecard); + devlink_linecard_destroy(linecard->devlink_linecard); + mutex_destroy(&linecard->lock); +} + +static int +mlxsw_linecard_event_delivery_init(struct mlxsw_core *mlxsw_core, + struct mlxsw_linecards *linecards, + u8 slot_index) +{ + struct mlxsw_linecard *linecard; + int err; + + linecard = mlxsw_linecard_get(linecards, slot_index); err = mlxsw_linecard_event_delivery_set(mlxsw_core, linecard, true); if (err) - goto err_event_delivery_set; + return err; err = mlxsw_linecard_status_get_and_process(mlxsw_core, linecards, linecard); @@ -1269,29 +1311,18 @@ static int mlxsw_linecard_init(struct mlxsw_core *mlxsw_core, err_status_get_and_process: mlxsw_linecard_event_delivery_set(mlxsw_core, linecard, false); -err_event_delivery_set: - devlink_linecard_destroy(linecard->devlink_linecard); -err_devlink_linecard_create: - mutex_destroy(&linecard->lock); return err; } -static void mlxsw_linecard_fini(struct mlxsw_core *mlxsw_core, - struct mlxsw_linecards *linecards, - u8 slot_index) +static void +mlxsw_linecard_event_delivery_fini(struct mlxsw_core *mlxsw_core, + struct mlxsw_linecards *linecards, + u8 slot_index) { struct mlxsw_linecard *linecard; linecard = mlxsw_linecard_get(linecards, slot_index); mlxsw_linecard_event_delivery_set(mlxsw_core, linecard, false); - cancel_delayed_work_sync(&linecard->status_event_to_dw); - /* Make sure all scheduled events are processed */ - mlxsw_core_flush_owq(); - if (linecard->active) - mlxsw_linecard_active_clear(linecard); - mlxsw_linecard_bdev_del(linecard); - devlink_linecard_destroy(linecard->devlink_linecard); - mutex_destroy(&linecard->lock); } /* LINECARDS INI BUNDLE FILE @@ -1505,6 +1536,11 @@ int mlxsw_linecards_init(struct mlxsw_core *mlxsw_core, if (err) goto err_traps_register; + err = mlxsw_core_irq_event_handler_register(mlxsw_core, + mlxsw_linecards_irq_event_handler); + if (err) + goto err_irq_event_handler_register; + mlxsw_core_linecards_set(mlxsw_core, linecards); for (i = 0; i < linecards->count; i++) { @@ -1513,11 +1549,25 @@ int mlxsw_linecards_init(struct mlxsw_core *mlxsw_core, goto err_linecard_init; } + for (i = 0; i < linecards->count; i++) { + err = mlxsw_linecard_event_delivery_init(mlxsw_core, linecards, + i + 1); + if (err) + goto err_linecard_event_delivery_init; + } + return 0; +err_linecard_event_delivery_init: + for (i--; i >= 0; i--) + mlxsw_linecard_event_delivery_fini(mlxsw_core, linecards, i + 1); + i = linecards->count; err_linecard_init: for (i--; i >= 0; i--) mlxsw_linecard_fini(mlxsw_core, linecards, i + 1); + mlxsw_core_irq_event_handler_unregister(mlxsw_core, + mlxsw_linecards_irq_event_handler); +err_irq_event_handler_register: mlxsw_core_traps_unregister(mlxsw_core, mlxsw_linecard_listener, ARRAY_SIZE(mlxsw_linecard_listener), mlxsw_core); @@ -1536,7 +1586,11 @@ void mlxsw_linecards_fini(struct mlxsw_core *mlxsw_core) if (!linecards) return; for (i = 0; i < linecards->count; i++) + mlxsw_linecard_event_delivery_fini(mlxsw_core, linecards, i + 1); + for (i = 0; i < linecards->count; i++) mlxsw_linecard_fini(mlxsw_core, linecards, i + 1); + mlxsw_core_irq_event_handler_unregister(mlxsw_core, + mlxsw_linecards_irq_event_handler); mlxsw_core_traps_unregister(mlxsw_core, mlxsw_linecard_listener, ARRAY_SIZE(mlxsw_linecard_listener), mlxsw_core); diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c index ce843ea91464..716c73e4fd59 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/i2c.c +++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c @@ -9,6 +9,7 @@ #include <linux/mutex.h> #include <linux/module.h> #include <linux/mod_devicetable.h> +#include <linux/platform_data/mlxreg.h> #include <linux/slab.h> #include "cmd.h" @@ -51,6 +52,15 @@ #define MLXSW_I2C_TIMEOUT_MSECS 5000 #define MLXSW_I2C_MAX_DATA_SIZE 256 +/* Driver can be initialized by kernel platform driver or from the user + * space. In the first case IRQ line number is passed through the platform + * data, otherwise default IRQ line is to be used. Default IRQ is relevant + * only for specific I2C slave address, allowing 3.4 MHz I2C path to the chip + * (special hardware feature for I2C acceleration). + */ +#define MLXSW_I2C_DEFAULT_IRQ 17 +#define MLXSW_FAST_I2C_SLAVE 0x37 + /** * struct mlxsw_i2c - device private data: * @cmd: command attributes; @@ -63,6 +73,9 @@ * @core: switch core pointer; * @bus_info: bus info block; * @block_size: maximum block size allowed to pass to under layer; + * @pdata: device platform data; + * @irq_work: interrupts work item; + * @irq: IRQ line number; */ struct mlxsw_i2c { struct { @@ -76,6 +89,9 @@ struct mlxsw_i2c { struct mlxsw_core *core; struct mlxsw_bus_info bus_info; u16 block_size; + struct mlxreg_core_hotplug_platform_data *pdata; + struct work_struct irq_work; + int irq; }; #define MLXSW_I2C_READ_MSG(_client, _addr_buf, _buf, _len) { \ @@ -546,6 +562,67 @@ static void mlxsw_i2c_fini(void *bus_priv) mlxsw_i2c->core = NULL; } +static void mlxsw_i2c_work_handler(struct work_struct *work) +{ + struct mlxsw_i2c *mlxsw_i2c; + + mlxsw_i2c = container_of(work, struct mlxsw_i2c, irq_work); + mlxsw_core_irq_event_handlers_call(mlxsw_i2c->core); +} + +static irqreturn_t mlxsw_i2c_irq_handler(int irq, void *dev) +{ + struct mlxsw_i2c *mlxsw_i2c = dev; + + mlxsw_core_schedule_work(&mlxsw_i2c->irq_work); + + /* Interrupt handler shares IRQ line with 'main' interrupt handler. + * Return here IRQ_NONE, while main handler will return IRQ_HANDLED. + */ + return IRQ_NONE; +} + +static int mlxsw_i2c_irq_init(struct mlxsw_i2c *mlxsw_i2c, u8 addr) +{ + int err; + + /* Initialize interrupt handler if system hotplug driver is reachable, + * otherwise interrupt line is not enabled and interrupts will not be + * raised to CPU. Also request_irq() call will be not valid. + */ + if (!IS_REACHABLE(CONFIG_MLXREG_HOTPLUG)) + return 0; + + /* Set default interrupt line. */ + if (mlxsw_i2c->pdata && mlxsw_i2c->pdata->irq) + mlxsw_i2c->irq = mlxsw_i2c->pdata->irq; + else if (addr == MLXSW_FAST_I2C_SLAVE) + mlxsw_i2c->irq = MLXSW_I2C_DEFAULT_IRQ; + + if (!mlxsw_i2c->irq) + return 0; + + INIT_WORK(&mlxsw_i2c->irq_work, mlxsw_i2c_work_handler); + err = request_irq(mlxsw_i2c->irq, mlxsw_i2c_irq_handler, + IRQF_TRIGGER_FALLING | IRQF_SHARED, "mlxsw-i2c", + mlxsw_i2c); + if (err) { + dev_err(mlxsw_i2c->bus_info.dev, "Failed to request irq: %d\n", + err); + return err; + } + + return 0; +} + +static void mlxsw_i2c_irq_fini(struct mlxsw_i2c *mlxsw_i2c) +{ + if (!IS_REACHABLE(CONFIG_MLXREG_HOTPLUG) || !mlxsw_i2c->irq) + return; + cancel_work_sync(&mlxsw_i2c->irq_work); + free_irq(mlxsw_i2c->irq, mlxsw_i2c); +} + static const struct mlxsw_bus mlxsw_i2c_bus = { .kind = "i2c", .init = mlxsw_i2c_init, @@ -638,17 +715,24 @@ static int mlxsw_i2c_probe(struct i2c_client *client, mlxsw_i2c->bus_info.dev = &client->dev; mlxsw_i2c->bus_info.low_frequency = true; mlxsw_i2c->dev = &client->dev; + mlxsw_i2c->pdata = client->dev.platform_data; + + err = mlxsw_i2c_irq_init(mlxsw_i2c, client->addr); + if (err) + goto errout; err = mlxsw_core_bus_device_register(&mlxsw_i2c->bus_info, &mlxsw_i2c_bus, mlxsw_i2c, false, NULL, NULL); if (err) { dev_err(&client->dev, "Fail to register core bus\n"); - return err; + goto err_bus_device_register; } return 0; +err_bus_device_register: + mlxsw_i2c_irq_fini(mlxsw_i2c); errout: mutex_destroy(&mlxsw_i2c->cmd.lock); i2c_set_clientdata(client, NULL); @@ -661,6 +745,7 @@ static int mlxsw_i2c_remove(struct i2c_client *client) struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client); mlxsw_core_bus_device_unregister(mlxsw_i2c->core, false); + mlxsw_i2c_irq_fini(mlxsw_i2c); mutex_destroy(&mlxsw_i2c->cmd.lock); return 0; diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c index bb1cd4bae82e..7d3fa2883e8b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/minimal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c @@ -26,20 +26,29 @@ static const struct mlxsw_fw_rev mlxsw_m_fw_rev = { struct mlxsw_m_port; +struct mlxsw_m_line_card { + bool active; + int module_to_port[]; +}; + struct mlxsw_m { struct mlxsw_m_port **ports; - int *module_to_port; struct mlxsw_core *core; const struct mlxsw_bus_info *bus_info; u8 base_mac[ETH_ALEN]; u8 max_ports; + u8 max_modules_per_slot; /* Maximum number of modules per-slot. */ + u8 num_of_slots; /* Including the main board. */ + struct mlxsw_m_line_card **line_cards; }; struct mlxsw_m_port { struct net_device *dev; struct mlxsw_m *mlxsw_m; u16 local_port; + u8 slot_index; u8 module; + u8 module_offset; }; static int mlxsw_m_base_mac_get(struct mlxsw_m *mlxsw_m) @@ -111,8 +120,9 @@ static int mlxsw_m_get_module_info(struct net_device *netdev, struct mlxsw_m_port *mlxsw_m_port = netdev_priv(netdev); struct mlxsw_core *core = mlxsw_m_port->mlxsw_m->core; - return mlxsw_env_get_module_info(netdev, core, 0, mlxsw_m_port->module, - modinfo); + return mlxsw_env_get_module_info(netdev, core, + mlxsw_m_port->slot_index, + mlxsw_m_port->module, modinfo); } static int @@ -122,7 +132,8 @@ mlxsw_m_get_module_eeprom(struct net_device *netdev, struct ethtool_eeprom *ee, struct mlxsw_m_port *mlxsw_m_port = netdev_priv(netdev); struct mlxsw_core *core = mlxsw_m_port->mlxsw_m->core; - return mlxsw_env_get_module_eeprom(netdev, core, 0, + return mlxsw_env_get_module_eeprom(netdev, core, + mlxsw_m_port->slot_index, mlxsw_m_port->module, ee, data); } @@ -134,7 +145,8 @@ mlxsw_m_get_module_eeprom_by_page(struct net_device *netdev, struct mlxsw_m_port *mlxsw_m_port = netdev_priv(netdev); struct mlxsw_core *core = mlxsw_m_port->mlxsw_m->core; - return mlxsw_env_get_module_eeprom_by_page(core, 0, + return mlxsw_env_get_module_eeprom_by_page(core, + mlxsw_m_port->slot_index, mlxsw_m_port->module, page, extack); } @@ -144,7 +156,8 @@ static int mlxsw_m_reset(struct net_device *netdev, u32 *flags) struct mlxsw_m_port *mlxsw_m_port = netdev_priv(netdev); struct mlxsw_core *core = mlxsw_m_port->mlxsw_m->core; - return mlxsw_env_reset_module(netdev, core, 0, mlxsw_m_port->module, + return mlxsw_env_reset_module(netdev, core, mlxsw_m_port->slot_index, + mlxsw_m_port->module, flags); } @@ -156,7 +169,8 @@ mlxsw_m_get_module_power_mode(struct net_device *netdev, struct mlxsw_m_port *mlxsw_m_port = netdev_priv(netdev); struct mlxsw_core *core = mlxsw_m_port->mlxsw_m->core; - return mlxsw_env_get_module_power_mode(core, 0, mlxsw_m_port->module, + return mlxsw_env_get_module_power_mode(core, mlxsw_m_port->slot_index, + mlxsw_m_port->module, params, extack); } @@ -168,7 +182,8 @@ mlxsw_m_set_module_power_mode(struct net_device *netdev, struct mlxsw_m_port *mlxsw_m_port = netdev_priv(netdev); struct mlxsw_core *core = mlxsw_m_port->mlxsw_m->core; - return mlxsw_env_set_module_power_mode(core, 0, mlxsw_m_port->module, + return mlxsw_env_set_module_power_mode(core, mlxsw_m_port->slot_index, + mlxsw_m_port->module, params->policy, extack); } @@ -184,7 +199,7 @@ static const struct ethtool_ops mlxsw_m_port_ethtool_ops = { static int mlxsw_m_port_module_info_get(struct mlxsw_m *mlxsw_m, u16 local_port, - u8 *p_module, u8 *p_width) + u8 *p_module, u8 *p_width, u8 *p_slot_index) { char pmlp_pl[MLXSW_REG_PMLP_LEN]; int err; @@ -195,6 +210,7 @@ mlxsw_m_port_module_info_get(struct mlxsw_m *mlxsw_m, u16 local_port, return err; *p_module = mlxsw_reg_pmlp_module_get(pmlp_pl, 0); *p_width = mlxsw_reg_pmlp_width_get(pmlp_pl); + *p_slot_index = mlxsw_reg_pmlp_slot_index_get(pmlp_pl, 0); return 0; } @@ -212,18 +228,25 @@ mlxsw_m_port_dev_addr_get(struct mlxsw_m_port *mlxsw_m_port) if (err) return err; mlxsw_reg_ppad_mac_memcpy_from(ppad_pl, addr); - eth_hw_addr_gen(mlxsw_m_port->dev, addr, mlxsw_m_port->module + 1); + eth_hw_addr_gen(mlxsw_m_port->dev, addr, mlxsw_m_port->module + 1 + + mlxsw_m_port->module_offset); return 0; } +static bool mlxsw_m_port_created(struct mlxsw_m *mlxsw_m, u16 local_port) +{ + return mlxsw_m->ports[local_port]; +} + static int -mlxsw_m_port_create(struct mlxsw_m *mlxsw_m, u16 local_port, u8 module) +mlxsw_m_port_create(struct mlxsw_m *mlxsw_m, u16 local_port, u8 slot_index, + u8 module) { struct mlxsw_m_port *mlxsw_m_port; struct net_device *dev; int err; - err = mlxsw_core_port_init(mlxsw_m->core, local_port, 0, + err = mlxsw_core_port_init(mlxsw_m->core, local_port, slot_index, module + 1, false, 0, false, 0, mlxsw_m->base_mac, sizeof(mlxsw_m->base_mac)); @@ -246,6 +269,15 @@ mlxsw_m_port_create(struct mlxsw_m *mlxsw_m, u16 local_port, u8 module) mlxsw_m_port->mlxsw_m = mlxsw_m; mlxsw_m_port->local_port = local_port; mlxsw_m_port->module = module; + mlxsw_m_port->slot_index = slot_index; + /* Add module offset for line card. Offset for main board iz zero. + * For line card in slot #n offset is calculated as (#n - 1) + * multiplied by maximum modules number, which could be found on a line + * card. + */ + mlxsw_m_port->module_offset = mlxsw_m_port->slot_index ? + (mlxsw_m_port->slot_index - 1) * + mlxsw_m->max_modules_per_slot : 0; dev->netdev_ops = &mlxsw_m_port_netdev_ops; dev->ethtool_ops = &mlxsw_m_port_ethtool_ops; @@ -291,19 +323,29 @@ static void mlxsw_m_port_remove(struct mlxsw_m *mlxsw_m, u16 local_port) mlxsw_core_port_fini(mlxsw_m->core, local_port); } +static int* +mlxsw_m_port_mapping_get(struct mlxsw_m *mlxsw_m, u8 slot_index, u8 module) +{ + return &mlxsw_m->line_cards[slot_index]->module_to_port[module]; +} + static int mlxsw_m_port_module_map(struct mlxsw_m *mlxsw_m, u16 local_port, u8 *last_module) { unsigned int max_ports = mlxsw_core_max_ports(mlxsw_m->core); - u8 module, width; + u8 module, width, slot_index; + int *module_to_port; int err; /* Fill out to local port mapping array */ err = mlxsw_m_port_module_info_get(mlxsw_m, local_port, &module, - &width); + &width, &slot_index); if (err) return err; + /* Skip if line card has been already configured */ + if (mlxsw_m->line_cards[slot_index]->active) + return 0; if (!width) return 0; /* Skip, if port belongs to the cluster */ @@ -313,91 +355,216 @@ static int mlxsw_m_port_module_map(struct mlxsw_m *mlxsw_m, u16 local_port, if (WARN_ON_ONCE(module >= max_ports)) return -EINVAL; - mlxsw_env_module_port_map(mlxsw_m->core, 0, module); - mlxsw_m->module_to_port[module] = ++mlxsw_m->max_ports; + mlxsw_env_module_port_map(mlxsw_m->core, slot_index, module); + module_to_port = mlxsw_m_port_mapping_get(mlxsw_m, slot_index, module); + *module_to_port = local_port; return 0; } -static void mlxsw_m_port_module_unmap(struct mlxsw_m *mlxsw_m, u8 module) +static void +mlxsw_m_port_module_unmap(struct mlxsw_m *mlxsw_m, u8 slot_index, u8 module) { - mlxsw_m->module_to_port[module] = -1; - mlxsw_env_module_port_unmap(mlxsw_m->core, 0, module); + int *module_to_port = mlxsw_m_port_mapping_get(mlxsw_m, slot_index, + module); + *module_to_port = -1; + mlxsw_env_module_port_unmap(mlxsw_m->core, slot_index, module); } -static int mlxsw_m_ports_create(struct mlxsw_m *mlxsw_m) +static int mlxsw_m_linecards_init(struct mlxsw_m *mlxsw_m) { unsigned int max_ports = mlxsw_core_max_ports(mlxsw_m->core); - u8 last_module = max_ports; - int i; - int err; + char mgpir_pl[MLXSW_REG_MGPIR_LEN]; + u8 num_of_modules; + int i, j, err; + + mlxsw_reg_mgpir_pack(mgpir_pl, 0); + err = mlxsw_reg_query(mlxsw_m->core, MLXSW_REG(mgpir), mgpir_pl); + if (err) + return err; + + mlxsw_reg_mgpir_unpack(mgpir_pl, NULL, NULL, NULL, &num_of_modules, + &mlxsw_m->num_of_slots); + /* If the system is modular, get the maximum number of modules per-slot. + * Otherwise, get the maximum number of modules on the main board. + */ + if (mlxsw_m->num_of_slots) + mlxsw_m->max_modules_per_slot = + mlxsw_reg_mgpir_max_modules_per_slot_get(mgpir_pl); + else + mlxsw_m->max_modules_per_slot = num_of_modules; + /* Add slot for main board. */ + mlxsw_m->num_of_slots += 1; mlxsw_m->ports = kcalloc(max_ports, sizeof(*mlxsw_m->ports), GFP_KERNEL); if (!mlxsw_m->ports) return -ENOMEM; - mlxsw_m->module_to_port = kmalloc_array(max_ports, sizeof(int), - GFP_KERNEL); - if (!mlxsw_m->module_to_port) { - err = -ENOMEM; - goto err_module_to_port_alloc; + mlxsw_m->line_cards = kcalloc(mlxsw_m->num_of_slots, + sizeof(*mlxsw_m->line_cards), + GFP_KERNEL); + if (!mlxsw_m->line_cards) + goto err_kcalloc; + + for (i = 0; i < mlxsw_m->num_of_slots; i++) { + mlxsw_m->line_cards[i] = + kzalloc(struct_size(mlxsw_m->line_cards[i], + module_to_port, + mlxsw_m->max_modules_per_slot), + GFP_KERNEL); + if (!mlxsw_m->line_cards[i]) + goto err_kmalloc_array; + + /* Invalidate the entries of module to local port mapping array. */ + for (j = 0; j < mlxsw_m->max_modules_per_slot; j++) + mlxsw_m->line_cards[i]->module_to_port[j] = -1; } - /* Invalidate the entries of module to local port mapping array */ - for (i = 0; i < max_ports; i++) - mlxsw_m->module_to_port[i] = -1; + return 0; - /* Fill out module to local port mapping array */ - for (i = 1; i < max_ports; i++) { - err = mlxsw_m_port_module_map(mlxsw_m, i, &last_module); - if (err) - goto err_module_to_port_map; +err_kmalloc_array: + for (i--; i >= 0; i--) + kfree(mlxsw_m->line_cards[i]); +err_kcalloc: + kfree(mlxsw_m->ports); + return err; +} + +static void mlxsw_m_linecards_fini(struct mlxsw_m *mlxsw_m) +{ + int i = mlxsw_m->num_of_slots; + + for (i--; i >= 0; i--) + kfree(mlxsw_m->line_cards[i]); + kfree(mlxsw_m->line_cards); + kfree(mlxsw_m->ports); +} + +static void +mlxsw_m_linecard_port_module_unmap(struct mlxsw_m *mlxsw_m, u8 slot_index) +{ + int i; + + for (i = mlxsw_m->max_modules_per_slot - 1; i >= 0; i--) { + int *module_to_port; + + module_to_port = mlxsw_m_port_mapping_get(mlxsw_m, slot_index, i); + if (*module_to_port > 0) + mlxsw_m_port_module_unmap(mlxsw_m, slot_index, i); } +} - /* Create port objects for each valid entry */ - for (i = 0; i < mlxsw_m->max_ports; i++) { - if (mlxsw_m->module_to_port[i] > 0) { - err = mlxsw_m_port_create(mlxsw_m, - mlxsw_m->module_to_port[i], - i); +static int +mlxsw_m_linecard_ports_create(struct mlxsw_m *mlxsw_m, u8 slot_index) +{ + int *module_to_port; + int i, err; + + for (i = 0; i < mlxsw_m->max_modules_per_slot; i++) { + module_to_port = mlxsw_m_port_mapping_get(mlxsw_m, slot_index, i); + if (*module_to_port > 0) { + err = mlxsw_m_port_create(mlxsw_m, *module_to_port, + slot_index, i); if (err) - goto err_module_to_port_create; + goto err_port_create; + /* Mark slot as active */ + if (!mlxsw_m->line_cards[slot_index]->active) + mlxsw_m->line_cards[slot_index]->active = true; } } - return 0; -err_module_to_port_create: +err_port_create: for (i--; i >= 0; i--) { - if (mlxsw_m->module_to_port[i] > 0) - mlxsw_m_port_remove(mlxsw_m, - mlxsw_m->module_to_port[i]); + module_to_port = mlxsw_m_port_mapping_get(mlxsw_m, slot_index, i); + if (*module_to_port > 0 && + mlxsw_m_port_created(mlxsw_m, *module_to_port)) { + mlxsw_m_port_remove(mlxsw_m, *module_to_port); + /* Mark slot as inactive */ + if (mlxsw_m->line_cards[slot_index]->active) + mlxsw_m->line_cards[slot_index]->active = false; + } } - i = max_ports; -err_module_to_port_map: - for (i--; i > 0; i--) - mlxsw_m_port_module_unmap(mlxsw_m, i); - kfree(mlxsw_m->module_to_port); -err_module_to_port_alloc: - kfree(mlxsw_m->ports); return err; } -static void mlxsw_m_ports_remove(struct mlxsw_m *mlxsw_m) +static void +mlxsw_m_linecard_ports_remove(struct mlxsw_m *mlxsw_m, u8 slot_index) { int i; - for (i = 0; i < mlxsw_m->max_ports; i++) { - if (mlxsw_m->module_to_port[i] > 0) { - mlxsw_m_port_remove(mlxsw_m, - mlxsw_m->module_to_port[i]); - mlxsw_m_port_module_unmap(mlxsw_m, i); + for (i = 0; i < mlxsw_m->max_modules_per_slot; i++) { + int *module_to_port = mlxsw_m_port_mapping_get(mlxsw_m, + slot_index, i); + + if (*module_to_port > 0 && + mlxsw_m_port_created(mlxsw_m, *module_to_port)) { + mlxsw_m_port_remove(mlxsw_m, *module_to_port); + mlxsw_m_port_module_unmap(mlxsw_m, slot_index, i); } } +} - kfree(mlxsw_m->module_to_port); - kfree(mlxsw_m->ports); +static int mlxsw_m_ports_module_map(struct mlxsw_m *mlxsw_m) +{ + unsigned int max_ports = mlxsw_core_max_ports(mlxsw_m->core); + u8 last_module = max_ports; + int i, err; + + for (i = 1; i < max_ports; i++) { + err = mlxsw_m_port_module_map(mlxsw_m, i, &last_module); + if (err) + return err; + } + + return 0; +} + +static int mlxsw_m_ports_create(struct mlxsw_m *mlxsw_m) +{ + int err; + + /* Fill out module to local port mapping array */ + err = mlxsw_m_ports_module_map(mlxsw_m); + if (err) + goto err_ports_module_map; + + /* Create port objects for each valid entry */ + err = mlxsw_m_linecard_ports_create(mlxsw_m, 0); + if (err) + goto err_linecard_ports_create; + + return 0; + +err_linecard_ports_create: +err_ports_module_map: + mlxsw_m_linecard_port_module_unmap(mlxsw_m, 0); + + return err; +} + +static void mlxsw_m_ports_remove(struct mlxsw_m *mlxsw_m) +{ + mlxsw_m_linecard_ports_remove(mlxsw_m, 0); +} + +static void +mlxsw_m_ports_remove_selected(struct mlxsw_core *mlxsw_core, + bool (*selector)(void *priv, u16 local_port), + void *priv) +{ + struct mlxsw_m *mlxsw_m = mlxsw_core_driver_priv(mlxsw_core); + struct mlxsw_linecard *linecard_priv = priv; + struct mlxsw_m_line_card *linecard; + + linecard = mlxsw_m->line_cards[linecard_priv->slot_index]; + + if (WARN_ON(!linecard->active)) + return; + + mlxsw_m_linecard_ports_remove(mlxsw_m, linecard_priv->slot_index); + linecard->active = false; } static int mlxsw_m_fw_rev_validate(struct mlxsw_m *mlxsw_m) @@ -418,6 +585,60 @@ static int mlxsw_m_fw_rev_validate(struct mlxsw_m *mlxsw_m) return -EINVAL; } +static void +mlxsw_m_got_active(struct mlxsw_core *mlxsw_core, u8 slot_index, void *priv) +{ + struct mlxsw_m_line_card *linecard; + struct mlxsw_m *mlxsw_m = priv; + int err; + + linecard = mlxsw_m->line_cards[slot_index]; + /* Skip if line card has been already configured during init */ + if (linecard->active) + return; + + /* Fill out module to local port mapping array */ + err = mlxsw_m_ports_module_map(mlxsw_m); + if (err) + goto err_ports_module_map; + + /* Create port objects for each valid entry */ + err = mlxsw_m_linecard_ports_create(mlxsw_m, slot_index); + if (err) { + dev_err(mlxsw_m->bus_info->dev, "Failed to create port for line card at slot %d\n", + slot_index); + goto err_linecard_ports_create; + } + + linecard->active = true; + + return; + +err_linecard_ports_create: +err_ports_module_map: + mlxsw_m_linecard_port_module_unmap(mlxsw_m, slot_index); +} + +static void +mlxsw_m_got_inactive(struct mlxsw_core *mlxsw_core, u8 slot_index, void *priv) +{ + struct mlxsw_m_line_card *linecard; + struct mlxsw_m *mlxsw_m = priv; + + linecard = mlxsw_m->line_cards[slot_index]; + + if (WARN_ON(!linecard->active)) + return; + + mlxsw_m_linecard_ports_remove(mlxsw_m, slot_index); + linecard->active = false; +} + +static struct mlxsw_linecards_event_ops mlxsw_m_event_ops = { + .got_active = mlxsw_m_got_active, + .got_inactive = mlxsw_m_got_inactive, +}; + static int mlxsw_m_init(struct mlxsw_core *mlxsw_core, const struct mlxsw_bus_info *mlxsw_bus_info, struct netlink_ext_ack *extack) @@ -438,13 +659,33 @@ static int mlxsw_m_init(struct mlxsw_core *mlxsw_core, return err; } + err = mlxsw_m_linecards_init(mlxsw_m); + if (err) { + dev_err(mlxsw_m->bus_info->dev, "Failed to create line cards\n"); + return err; + } + + err = mlxsw_linecards_event_ops_register(mlxsw_core, + &mlxsw_m_event_ops, mlxsw_m); + if (err) { + dev_err(mlxsw_m->bus_info->dev, "Failed to register line cards operations\n"); + goto linecards_event_ops_register; + } + err = mlxsw_m_ports_create(mlxsw_m); if (err) { dev_err(mlxsw_m->bus_info->dev, "Failed to create ports\n"); - return err; + goto err_ports_create; } return 0; + +err_ports_create: + mlxsw_linecards_event_ops_unregister(mlxsw_core, + &mlxsw_m_event_ops, mlxsw_m); +linecards_event_ops_register: + mlxsw_m_linecards_fini(mlxsw_m); + return err; } static void mlxsw_m_fini(struct mlxsw_core *mlxsw_core) @@ -452,6 +693,9 @@ static void mlxsw_m_fini(struct mlxsw_core *mlxsw_core) struct mlxsw_m *mlxsw_m = mlxsw_core_driver_priv(mlxsw_core); mlxsw_m_ports_remove(mlxsw_m); + mlxsw_linecards_event_ops_unregister(mlxsw_core, + &mlxsw_m_event_ops, mlxsw_m); + mlxsw_m_linecards_fini(mlxsw_m); } static const struct mlxsw_config_profile mlxsw_m_config_profile; @@ -461,6 +705,7 @@ static struct mlxsw_driver mlxsw_m_driver = { .priv_size = sizeof(struct mlxsw_m), .init = mlxsw_m_init, .fini = mlxsw_m_fini, + .ports_remove_selected = mlxsw_m_ports_remove_selected, .profile = &mlxsw_m_config_profile, }; diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c index 82d55fc27edc..70bc7253454f 100644 --- a/drivers/net/ethernet/micrel/ks8851_spi.c +++ b/drivers/net/ethernet/micrel/ks8851_spi.c @@ -413,7 +413,8 @@ static int ks8851_probe_spi(struct spi_device *spi) spi->bits_per_word = 8; - ks = netdev_priv(netdev); + kss = netdev_priv(netdev); + ks = &kss->ks8851; ks->lock = ks8851_lock_spi; ks->unlock = ks8851_unlock_spi; @@ -433,8 +434,6 @@ static int ks8851_probe_spi(struct spi_device *spi) IRQ_RXPSI) /* RX process stop */ ks->rc_ier = STD_IRQ; - kss = to_ks8851_spi(ks); - kss->spidev = spi; mutex_init(&kss->lock); INIT_WORK(&kss->tx_work, ks8851_tx_work); diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 306026e6aa11..dddaffdaad9a 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -2064,6 +2064,16 @@ static int ocelot_bond_get_id(struct ocelot *ocelot, struct net_device *bond) return __ffs(bond_mask); } +/* Returns the mask of user ports assigned to this DSA tag_8021q CPU port. + * Note that when CPU ports are in a LAG, the user ports are assigned to the + * 'primary' CPU port, the one whose physical port number gives the logical + * port number of the LAG. + * + * We leave PGID_SRC poorly configured for the 'secondary' CPU port in the LAG + * (to which no user port is assigned), but it appears that forwarding from + * this secondary CPU port looks at the PGID_SRC associated with the logical + * port ID that it's assigned to, which *is* configured properly. + */ static u32 ocelot_dsa_8021q_cpu_assigned_ports(struct ocelot *ocelot, struct ocelot_port *cpu) { @@ -2080,9 +2090,15 @@ static u32 ocelot_dsa_8021q_cpu_assigned_ports(struct ocelot *ocelot, mask |= BIT(port); } + if (cpu->bond) + mask &= ~ocelot_get_bond_mask(ocelot, cpu->bond); + return mask; } +/* Returns the DSA tag_8021q CPU port that the given port is assigned to, + * or the bit mask of CPU ports if said CPU port is in a LAG. + */ u32 ocelot_port_assigned_dsa_8021q_cpu_mask(struct ocelot *ocelot, int port) { struct ocelot_port *ocelot_port = ocelot->ports[port]; @@ -2091,6 +2107,9 @@ u32 ocelot_port_assigned_dsa_8021q_cpu_mask(struct ocelot *ocelot, int port) if (!cpu_port) return 0; + if (cpu_port->bond) + return ocelot_get_bond_mask(ocelot, cpu_port->bond); + return BIT(cpu_port->index); } EXPORT_SYMBOL_GPL(ocelot_port_assigned_dsa_8021q_cpu_mask); @@ -2214,61 +2233,61 @@ static void ocelot_update_pgid_cpu(struct ocelot *ocelot) ocelot_write_rix(ocelot, pgid_cpu, ANA_PGID_PGID, PGID_CPU); } -void ocelot_port_assign_dsa_8021q_cpu(struct ocelot *ocelot, int port, - int cpu) +void ocelot_port_setup_dsa_8021q_cpu(struct ocelot *ocelot, int cpu) { struct ocelot_port *cpu_port = ocelot->ports[cpu]; u16 vid; mutex_lock(&ocelot->fwd_domain_lock); - ocelot->ports[port]->dsa_8021q_cpu = cpu_port; - - if (!cpu_port->is_dsa_8021q_cpu) { - cpu_port->is_dsa_8021q_cpu = true; - - for (vid = OCELOT_RSV_VLAN_RANGE_START; vid < VLAN_N_VID; vid++) - ocelot_vlan_member_add(ocelot, cpu, vid, true); + cpu_port->is_dsa_8021q_cpu = true; - ocelot_update_pgid_cpu(ocelot); - } + for (vid = OCELOT_RSV_VLAN_RANGE_START; vid < VLAN_N_VID; vid++) + ocelot_vlan_member_add(ocelot, cpu, vid, true); - ocelot_apply_bridge_fwd_mask(ocelot, true); + ocelot_update_pgid_cpu(ocelot); mutex_unlock(&ocelot->fwd_domain_lock); } -EXPORT_SYMBOL_GPL(ocelot_port_assign_dsa_8021q_cpu); +EXPORT_SYMBOL_GPL(ocelot_port_setup_dsa_8021q_cpu); -void ocelot_port_unassign_dsa_8021q_cpu(struct ocelot *ocelot, int port) +void ocelot_port_teardown_dsa_8021q_cpu(struct ocelot *ocelot, int cpu) { - struct ocelot_port *cpu_port = ocelot->ports[port]->dsa_8021q_cpu; - bool keep = false; + struct ocelot_port *cpu_port = ocelot->ports[cpu]; u16 vid; - int p; mutex_lock(&ocelot->fwd_domain_lock); - ocelot->ports[port]->dsa_8021q_cpu = NULL; + cpu_port->is_dsa_8021q_cpu = false; - for (p = 0; p < ocelot->num_phys_ports; p++) { - if (!ocelot->ports[p]) - continue; + for (vid = OCELOT_RSV_VLAN_RANGE_START; vid < VLAN_N_VID; vid++) + ocelot_vlan_member_del(ocelot, cpu_port->index, vid); - if (ocelot->ports[p]->dsa_8021q_cpu == cpu_port) { - keep = true; - break; - } - } + ocelot_update_pgid_cpu(ocelot); - if (!keep) { - cpu_port->is_dsa_8021q_cpu = false; + mutex_unlock(&ocelot->fwd_domain_lock); +} +EXPORT_SYMBOL_GPL(ocelot_port_teardown_dsa_8021q_cpu); + +void ocelot_port_assign_dsa_8021q_cpu(struct ocelot *ocelot, int port, + int cpu) +{ + struct ocelot_port *cpu_port = ocelot->ports[cpu]; - for (vid = OCELOT_RSV_VLAN_RANGE_START; vid < VLAN_N_VID; vid++) - ocelot_vlan_member_del(ocelot, cpu_port->index, vid); + mutex_lock(&ocelot->fwd_domain_lock); - ocelot_update_pgid_cpu(ocelot); - } + ocelot->ports[port]->dsa_8021q_cpu = cpu_port; + ocelot_apply_bridge_fwd_mask(ocelot, true); + + mutex_unlock(&ocelot->fwd_domain_lock); +} +EXPORT_SYMBOL_GPL(ocelot_port_assign_dsa_8021q_cpu); +void ocelot_port_unassign_dsa_8021q_cpu(struct ocelot *ocelot, int port) +{ + mutex_lock(&ocelot->fwd_domain_lock); + + ocelot->ports[port]->dsa_8021q_cpu = NULL; ocelot_apply_bridge_fwd_mask(ocelot, true); mutex_unlock(&ocelot->fwd_domain_lock); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index f4a6b590a1e3..7ef5d8208a4e 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -74,6 +74,9 @@ #define AM65_CPSW_PORTN_REG_TS_VLAN_LTYPE_REG 0x318 #define AM65_CPSW_PORTN_REG_TS_CTL_LTYPE2 0x31C +#define AM65_CPSW_SGMII_CONTROL_REG 0x010 +#define AM65_CPSW_SGMII_CONTROL_MR_AN_ENABLE BIT(0) + #define AM65_CPSW_CTL_VLAN_AWARE BIT(1) #define AM65_CPSW_CTL_P0_ENABLE BIT(2) #define AM65_CPSW_CTL_P0_TX_CRC_REMOVE BIT(13) @@ -590,11 +593,6 @@ static int am65_cpsw_nuss_ndo_slave_open(struct net_device *ndev) /* mac_sl should be configured via phy-link interface */ am65_cpsw_sl_ctl_reset(port); - ret = phy_set_mode_ext(port->slave.ifphy, PHY_MODE_ETHERNET, - port->slave.phy_if); - if (ret) - goto error_cleanup; - ret = phylink_of_phy_connect(port->slave.phylink, port->slave.phy_node, 0); if (ret) goto error_cleanup; @@ -1409,7 +1407,14 @@ static const struct net_device_ops am65_cpsw_nuss_netdev_ops = { static void am65_cpsw_nuss_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { - /* Currently not used */ + struct am65_cpsw_slave_data *slave = container_of(config, struct am65_cpsw_slave_data, + phylink_config); + struct am65_cpsw_port *port = container_of(slave, struct am65_cpsw_port, slave); + struct am65_cpsw_common *common = port->common; + + if (common->pdata.extra_modes & BIT(state->interface)) + writel(AM65_CPSW_SGMII_CONTROL_MR_AN_ENABLE, + port->sgmii_base + AM65_CPSW_SGMII_CONTROL_REG); } static void am65_cpsw_nuss_mac_link_down(struct phylink_config *config, unsigned int mode, @@ -1847,6 +1852,8 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common) port->common = common; port->port_base = common->cpsw_base + AM65_CPSW_NU_PORTS_BASE + AM65_CPSW_NU_PORTS_OFFSET * (port_id); + if (common->pdata.extra_modes) + port->sgmii_base = common->ss_base + AM65_CPSW_SGMII_BASE * (port_id); port->stat_base = common->cpsw_base + AM65_CPSW_NU_STATS_BASE + (AM65_CPSW_NU_STATS_PORT_OFFSET * port_id); port->name = of_get_property(port_np, "label", NULL); @@ -1886,6 +1893,10 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common) goto of_node_put; } + ret = phy_set_mode_ext(port->slave.ifphy, PHY_MODE_ETHERNET, port->slave.phy_if); + if (ret) + goto of_node_put; + ret = of_get_mac_address(port_np, port->slave.mac_addr); if (ret) { am65_cpsw_am654_get_efuse_macid(port_np, @@ -1981,7 +1992,18 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx) port->slave.phylink_config.type = PHYLINK_NETDEV; port->slave.phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD; - phy_interface_set_rgmii(port->slave.phylink_config.supported_interfaces); + if (phy_interface_mode_is_rgmii(port->slave.phy_if)) { + phy_interface_set_rgmii(port->slave.phylink_config.supported_interfaces); + } else if (port->slave.phy_if == PHY_INTERFACE_MODE_RMII) { + __set_bit(PHY_INTERFACE_MODE_RMII, + port->slave.phylink_config.supported_interfaces); + } else if (common->pdata.extra_modes & BIT(port->slave.phy_if)) { + __set_bit(PHY_INTERFACE_MODE_QSGMII, + port->slave.phylink_config.supported_interfaces); + } else { + dev_err(dev, "selected phy-mode is not supported\n"); + return -EOPNOTSUPP; + } phylink = phylink_create(&port->slave.phylink_config, of_node_to_fwnode(port->slave.phy_node), @@ -2611,10 +2633,18 @@ static const struct am65_cpsw_pdata am64x_cpswxg_pdata = { .fdqring_mode = K3_RINGACC_RING_MODE_RING, }; +static const struct am65_cpsw_pdata j7200_cpswxg_pdata = { + .quirks = 0, + .ale_dev_id = "am64-cpswxg", + .fdqring_mode = K3_RINGACC_RING_MODE_RING, + .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII), +}; + static const struct of_device_id am65_cpsw_nuss_of_mtable[] = { { .compatible = "ti,am654-cpsw-nuss", .data = &am65x_sr1_0}, { .compatible = "ti,j721e-cpsw-nuss", .data = &j721e_pdata}, { .compatible = "ti,am642-cpsw-nuss", .data = &am64x_cpswxg_pdata}, + { .compatible = "ti,j7200-cpswxg-nuss", .data = &j7200_cpswxg_pdata}, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, am65_cpsw_nuss_of_mtable); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h index ac945631bf2f..2c9850fdfcb6 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h @@ -46,6 +46,7 @@ struct am65_cpsw_port { const char *name; u32 port_id; void __iomem *port_base; + void __iomem *sgmii_base; void __iomem *stat_base; void __iomem *fetch_ram_base; bool disabled; @@ -88,6 +89,7 @@ struct am65_cpsw_rx_chn { struct am65_cpsw_pdata { u32 quirks; + u64 extra_modes; enum k3_ring_mode fdqring_mode; const char *ale_dev_id; }; diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 2a8195c50d14..ec91e671f8aa 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -10,6 +10,7 @@ #include <linux/mdio.h> #include <linux/mii.h> #include <linux/module.h> +#include <linux/of.h> #include <linux/phy.h> #include <linux/hwmon.h> #include <linux/bitfield.h> @@ -34,6 +35,11 @@ #define MII_CFG1 18 #define MII_CFG1_MASTER_SLAVE BIT(15) #define MII_CFG1_AUTO_OP BIT(14) +#define MII_CFG1_INTERFACE_MODE_MASK GENMASK(9, 8) +#define MII_CFG1_MII_MODE (0x0 << 8) +#define MII_CFG1_RMII_MODE_REFCLK_IN BIT(8) +#define MII_CFG1_RMII_MODE_REFCLK_OUT BIT(9) +#define MII_CFG1_REVMII_MODE GENMASK(9, 8) #define MII_CFG1_SLEEP_CONFIRM BIT(6) #define MII_CFG1_LED_MODE_MASK GENMASK(5, 4) #define MII_CFG1_LED_MODE_LINKUP 0 @@ -72,11 +78,15 @@ #define MII_COMMCFG 27 #define MII_COMMCFG_AUTO_OP BIT(15) +/* Configure REF_CLK as input in RMII mode */ +#define TJA110X_RMII_MODE_REFCLK_IN BIT(0) + struct tja11xx_priv { char *hwmon_name; struct device *hwmon_dev; struct phy_device *phydev; struct work_struct phy_register_work; + u32 flags; }; struct tja11xx_phy_stats { @@ -251,8 +261,34 @@ do_test: return __genphy_config_aneg(phydev, changed); } +static int tja11xx_get_interface_mode(struct phy_device *phydev) +{ + struct tja11xx_priv *priv = phydev->priv; + int mii_mode; + + switch (phydev->interface) { + case PHY_INTERFACE_MODE_MII: + mii_mode = MII_CFG1_MII_MODE; + break; + case PHY_INTERFACE_MODE_REVMII: + mii_mode = MII_CFG1_REVMII_MODE; + break; + case PHY_INTERFACE_MODE_RMII: + if (priv->flags & TJA110X_RMII_MODE_REFCLK_IN) + mii_mode = MII_CFG1_RMII_MODE_REFCLK_IN; + else + mii_mode = MII_CFG1_RMII_MODE_REFCLK_OUT; + break; + default: + return -EINVAL; + } + + return mii_mode; +} + static int tja11xx_config_init(struct phy_device *phydev) { + u16 reg_mask, reg_val; int ret; ret = tja11xx_enable_reg_write(phydev); @@ -265,15 +301,32 @@ static int tja11xx_config_init(struct phy_device *phydev) switch (phydev->phy_id & PHY_ID_MASK) { case PHY_ID_TJA1100: - ret = phy_modify(phydev, MII_CFG1, - MII_CFG1_AUTO_OP | MII_CFG1_LED_MODE_MASK | - MII_CFG1_LED_ENABLE, - MII_CFG1_AUTO_OP | MII_CFG1_LED_MODE_LINKUP | - MII_CFG1_LED_ENABLE); + reg_mask = MII_CFG1_AUTO_OP | MII_CFG1_LED_MODE_MASK | + MII_CFG1_LED_ENABLE; + reg_val = MII_CFG1_AUTO_OP | MII_CFG1_LED_MODE_LINKUP | + MII_CFG1_LED_ENABLE; + + reg_mask |= MII_CFG1_INTERFACE_MODE_MASK; + ret = tja11xx_get_interface_mode(phydev); + if (ret < 0) + return ret; + + reg_val |= (ret & 0xffff); + ret = phy_modify(phydev, MII_CFG1, reg_mask, reg_val); if (ret) return ret; break; case PHY_ID_TJA1101: + reg_mask = MII_CFG1_INTERFACE_MODE_MASK; + ret = tja11xx_get_interface_mode(phydev); + if (ret < 0) + return ret; + + reg_val = ret & 0xffff; + ret = phy_modify(phydev, MII_CFG1, reg_mask, reg_val); + if (ret) + return ret; + fallthrough; case PHY_ID_TJA1102: ret = phy_set_bits(phydev, MII_COMMCFG, MII_COMMCFG_AUTO_OP); if (ret) @@ -458,16 +511,36 @@ static int tja11xx_hwmon_register(struct phy_device *phydev, return PTR_ERR_OR_ZERO(priv->hwmon_dev); } +static int tja11xx_parse_dt(struct phy_device *phydev) +{ + struct device_node *node = phydev->mdio.dev.of_node; + struct tja11xx_priv *priv = phydev->priv; + + if (!IS_ENABLED(CONFIG_OF_MDIO)) + return 0; + + if (of_property_read_bool(node, "nxp,rmii-refclk-in")) + priv->flags |= TJA110X_RMII_MODE_REFCLK_IN; + + return 0; +} + static int tja11xx_probe(struct phy_device *phydev) { struct device *dev = &phydev->mdio.dev; struct tja11xx_priv *priv; + int ret; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; priv->phydev = phydev; + phydev->priv = priv; + + ret = tja11xx_parse_dt(phydev); + if (ret) + return ret; return tja11xx_hwmon_register(phydev, priv); } diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..fc93c9488c76 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -371,4 +371,11 @@ */ #define __weak __attribute__((__weak__)) +/* + * Used by functions that use '__builtin_return_address'. These function + * don't want to be splited or made inline, which can make + * the '__builtin_return_address' get unexpected address. + */ +#define __fix_address noinline __noclone + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..b51d07a727c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1195,7 +1195,8 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 1c53c4c4d88f..568a87c5e0d0 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -78,6 +78,7 @@ struct vsock_sock { s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); struct sock *vsock_create_connected(struct sock *parent); +void vsock_data_ready(struct sock *sk); /**** TRANSPORT ****/ @@ -135,6 +136,7 @@ struct vsock_transport { u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); + int (*set_rcvlowat)(struct vsock_sock *vsk, int val); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, diff --git a/include/net/dsa.h b/include/net/dsa.h index b902b31bebce..f2ce12860546 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -559,6 +559,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) +#define dsa_tree_for_each_cpu_port(_dp, _dst) \ + list_for_each_entry((_dp), &(_dst)->ports, list) \ + if (dsa_port_is_cpu((_dp))) + #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2edea901bbd5..2a7e18ee5577 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1024,6 +1024,8 @@ void ocelot_deinit(struct ocelot *ocelot); void ocelot_init_port(struct ocelot *ocelot, int port); void ocelot_deinit_port(struct ocelot *ocelot, int port); +void ocelot_port_setup_dsa_8021q_cpu(struct ocelot *ocelot, int cpu); +void ocelot_port_teardown_dsa_8021q_cpu(struct ocelot *ocelot, int cpu); void ocelot_port_assign_dsa_8021q_cpu(struct ocelot *ocelot, int port, int cpu); void ocelot_port_unassign_dsa_8021q_cpu(struct ocelot *ocelot, int port); u32 ocelot_port_assigned_dsa_8021q_cpu_mask(struct ocelot *ocelot, int port); diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 1e543cf0568c..e0ab261ceca2 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -41,12 +41,20 @@ struct sockaddr_nl { __u32 nl_groups; /* multicast groups mask */ }; +/** + * struct nlmsghdr - fixed format metadata header of Netlink messages + * @nlmsg_len: Length of message including header + * @nlmsg_type: Message content type + * @nlmsg_flags: Additional flags + * @nlmsg_seq: Sequence number + * @nlmsg_pid: Sending process port ID + */ struct nlmsghdr { - __u32 nlmsg_len; /* Length of message including header */ - __u16 nlmsg_type; /* Message content */ - __u16 nlmsg_flags; /* Additional flags */ - __u32 nlmsg_seq; /* Sequence number */ - __u32 nlmsg_pid; /* Sending process port ID */ + __u32 nlmsg_len; + __u16 nlmsg_type; + __u16 nlmsg_flags; + __u32 nlmsg_seq; + __u32 nlmsg_pid; }; /* Flags values */ @@ -337,6 +345,9 @@ enum netlink_attribute_type { * bitfield32 type (U32) * @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64) * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + * + * @__NL_POLICY_TYPE_ATTR_MAX: number of attributes + * @NL_POLICY_TYPE_ATTR_MAX: highest attribute number */ enum netlink_policy_type_attr { NL_POLICY_TYPE_ATTR_UNSPEC, diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a84a7cfb9d6d..efbd93e92ce2 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -568,26 +568,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; - /* Also don't allow bridging of net devices that are DSA masters, since - * the bridge layer rx_handler prevents the DSA fake ethertype handler - * to be invoked, so we don't get the chance to strip off and parse the - * DSA switch tag protocol header (the bridge layer just returns - * RX_HANDLER_CONSUMED, stopping RX processing for these frames). - * The only case where that would not be an issue is when bridging can - * already be offloaded, such as when the DSA master is itself a DSA - * or plain switchdev port, and is bridged only with other ports from - * the same hardware device. - */ - if (netdev_uses_dsa(dev)) { - list_for_each_entry(p, &br->port_list, list) { - if (!netdev_port_same_parent_id(dev, p->dev)) { - NL_SET_ERR_MSG(extack, - "Cannot do software bridging with a DSA master"); - return -EINVAL; - } - } - } - /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 974bbbbe7138..35d9d5958dc6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -777,9 +777,10 @@ EXPORT_SYMBOL(__kfree_skb); * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' * tracepoint. */ -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { - if (!skb_unref(skb)) + if (unlikely(!skb_unref(skb))) return; DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 055a6d1d4372..ed56c7a554b8 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1060,26 +1060,24 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { - struct dsa_port *dp; + struct dsa_port *cpu_dp; int err = 0; rtnl_lock(); - list_for_each_entry(dp, &dst->ports, list) { - if (dsa_port_is_cpu(dp)) { - struct net_device *master = dp->master; - bool admin_up = (master->flags & IFF_UP) && - !qdisc_tx_is_noop(master); + dsa_tree_for_each_cpu_port(cpu_dp, dst) { + struct net_device *master = cpu_dp->master; + bool admin_up = (master->flags & IFF_UP) && + !qdisc_tx_is_noop(master); - err = dsa_master_setup(master, dp); - if (err) - break; + err = dsa_master_setup(master, cpu_dp); + if (err) + break; - /* Replay master state event */ - dsa_tree_master_admin_state_change(dst, master, admin_up); - dsa_tree_master_oper_state_change(dst, master, - netif_oper_up(master)); - } + /* Replay master state event */ + dsa_tree_master_admin_state_change(dst, master, admin_up); + dsa_tree_master_oper_state_change(dst, master, + netif_oper_up(master)); } rtnl_unlock(); @@ -1089,22 +1087,20 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst) static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) { - struct dsa_port *dp; + struct dsa_port *cpu_dp; rtnl_lock(); - list_for_each_entry(dp, &dst->ports, list) { - if (dsa_port_is_cpu(dp)) { - struct net_device *master = dp->master; + dsa_tree_for_each_cpu_port(cpu_dp, dst) { + struct net_device *master = cpu_dp->master; - /* Synthesizing an "admin down" state is sufficient for - * the switches to get a notification if the master is - * currently up and running. - */ - dsa_tree_master_admin_state_change(dst, master, false); + /* Synthesizing an "admin down" state is sufficient for + * the switches to get a notification if the master is + * currently up and running. + */ + dsa_tree_master_admin_state_change(dst, master, false); - dsa_master_teardown(master); - } + dsa_master_teardown(master); } rtnl_unlock(); @@ -1252,7 +1248,6 @@ out_disconnect: * they would have formed disjoint trees (different "dsa,member" values). */ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, - struct net_device *master, const struct dsa_device_ops *tag_ops, const struct dsa_device_ops *old_tag_ops) { @@ -1268,14 +1263,11 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, * attempts to change the tagging protocol. If we ever lift the IFF_UP * restriction, there needs to be another mutex which serializes this. */ - if (master->flags & IFF_UP) - goto out_unlock; - list_for_each_entry(dp, &dst->ports, list) { - if (!dsa_port_is_user(dp)) - continue; + if (dsa_port_is_cpu(dp) && (dp->master->flags & IFF_UP)) + goto out_unlock; - if (dp->slave->flags & IFF_UP) + if (dsa_port_is_user(dp) && (dp->slave->flags & IFF_UP)) goto out_unlock; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 8924366467e0..614fbba8fe39 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -545,7 +545,6 @@ struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); int dsa_broadcast(unsigned long e, void *v); int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, - struct net_device *master, const struct dsa_device_ops *tag_ops, const struct dsa_device_ops *old_tag_ops); void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst, diff --git a/net/dsa/master.c b/net/dsa/master.c index 9e5dabc751d9..fb810edc8281 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -307,7 +307,7 @@ static ssize_t tagging_store(struct device *d, struct device_attribute *attr, */ goto out; - err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, dev, new_tag_ops, + err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, new_tag_ops, old_tag_ops); if (err) { /* On failure the old tagger is restored, so we don't need the diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ce12c3427bad..b24f1131af90 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2476,6 +2476,9 @@ static int dsa_slave_changeupper(struct net_device *dev, struct netlink_ext_ack *extack; int err = NOTIFY_DONE; + if (!dsa_slave_dev_check(dev)) + return err; + extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { @@ -2531,6 +2534,9 @@ static int dsa_slave_prechangeupper(struct net_device *dev, { struct dsa_port *dp = dsa_slave_to_port(dev); + if (!dsa_slave_dev_check(dev)) + return NOTIFY_DONE; + if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) @@ -2551,6 +2557,9 @@ dsa_slave_lag_changeupper(struct net_device *dev, int err = NOTIFY_DONE; struct dsa_port *dp; + if (!netif_is_lag_master(dev)) + return err; + netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_slave_dev_check(lower)) continue; @@ -2580,6 +2589,9 @@ dsa_slave_lag_prechangeupper(struct net_device *dev, int err = NOTIFY_DONE; struct dsa_port *dp; + if (!netif_is_lag_master(dev)) + return err; + netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_slave_dev_check(lower)) continue; @@ -2687,6 +2699,75 @@ dsa_slave_prechangeupper_sanity_check(struct net_device *dev, return NOTIFY_DONE; } +static int +dsa_master_prechangeupper_sanity_check(struct net_device *master, + struct netdev_notifier_changeupper_info *info) +{ + struct netlink_ext_ack *extack; + + if (!netdev_uses_dsa(master)) + return NOTIFY_DONE; + + if (!info->linking) + return NOTIFY_DONE; + + /* Allow DSA switch uppers */ + if (dsa_slave_dev_check(info->upper_dev)) + return NOTIFY_DONE; + + /* Allow bridge uppers of DSA masters, subject to further + * restrictions in dsa_bridge_prechangelower_sanity_check() + */ + if (netif_is_bridge_master(info->upper_dev)) + return NOTIFY_DONE; + + extack = netdev_notifier_info_to_extack(&info->info); + + NL_SET_ERR_MSG_MOD(extack, + "DSA master cannot join unknown upper interfaces"); + return notifier_from_errno(-EBUSY); +} + +/* Don't allow bridging of DSA masters, since the bridge layer rx_handler + * prevents the DSA fake ethertype handler to be invoked, so we don't get the + * chance to strip off and parse the DSA switch tag protocol header (the bridge + * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these + * frames). + * The only case where that would not be an issue is when bridging can already + * be offloaded, such as when the DSA master is itself a DSA or plain switchdev + * port, and is bridged only with other ports from the same hardware device. + */ +static int +dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *br = info->upper_dev; + struct netlink_ext_ack *extack; + struct net_device *lower; + struct list_head *iter; + + if (!netif_is_bridge_master(br)) + return NOTIFY_DONE; + + if (!info->linking) + return NOTIFY_DONE; + + extack = netdev_notifier_info_to_extack(&info->info); + + netdev_for_each_lower_dev(br, lower, iter) { + if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower)) + continue; + + if (!netdev_port_same_parent_id(lower, new_lower)) { + NL_SET_ERR_MSG(extack, + "Cannot do software bridging with a DSA master"); + return notifier_from_errno(-EINVAL); + } + } + + return NOTIFY_DONE; +} + static int dsa_slave_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -2698,25 +2779,40 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, int err; err = dsa_slave_prechangeupper_sanity_check(dev, info); - if (err != NOTIFY_DONE) + if (notifier_to_errno(err)) + return err; + + err = dsa_master_prechangeupper_sanity_check(dev, info); + if (notifier_to_errno(err)) return err; - if (dsa_slave_dev_check(dev)) - return dsa_slave_prechangeupper(dev, ptr); + err = dsa_bridge_prechangelower_sanity_check(dev, info); + if (notifier_to_errno(err)) + return err; - if (netif_is_lag_master(dev)) - return dsa_slave_lag_prechangeupper(dev, ptr); + err = dsa_slave_prechangeupper(dev, ptr); + if (notifier_to_errno(err)) + return err; + + err = dsa_slave_lag_prechangeupper(dev, ptr); + if (notifier_to_errno(err)) + return err; break; } - case NETDEV_CHANGEUPPER: - if (dsa_slave_dev_check(dev)) - return dsa_slave_changeupper(dev, ptr); + case NETDEV_CHANGEUPPER: { + int err; - if (netif_is_lag_master(dev)) - return dsa_slave_lag_changeupper(dev, ptr); + err = dsa_slave_changeupper(dev, ptr); + if (notifier_to_errno(err)) + return err; + + err = dsa_slave_lag_changeupper(dev, ptr); + if (notifier_to_errno(err)) + return err; break; + } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; @@ -2777,6 +2873,9 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, if (!dsa_port_is_user(dp)) continue; + if (dp->cpu_dp != cpu_dp) + continue; + list_add(&dp->slave->close_list, &close_list); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bbe218753662..ba62f8e8bd15 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4433,12 +4433,16 @@ static void __tcp_alloc_md5sig_pool(void) * to memory. See smp_rmb() in tcp_get_md5sig_pool() */ smp_wmb(); - tcp_md5sig_pool_populated = true; + /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool() + * and tcp_get_md5sig_pool(). + */ + WRITE_ONCE(tcp_md5sig_pool_populated, true); } bool tcp_alloc_md5sig_pool(void) { - if (unlikely(!tcp_md5sig_pool_populated)) { + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); if (!tcp_md5sig_pool_populated) { @@ -4449,7 +4453,8 @@ bool tcp_alloc_md5sig_pool(void) mutex_unlock(&tcp_md5sig_mutex); } - return tcp_md5sig_pool_populated; + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + return READ_ONCE(tcp_md5sig_pool_populated); } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -4465,7 +4470,8 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { local_bh_disable(); - if (tcp_md5sig_pool_populated) { + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + if (READ_ONCE(tcp_md5sig_pool_populated)) { /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ smp_rmb(); return this_cpu_ptr(&tcp_md5sig_pool); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 790d6809be81..1ebab4b11262 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1977,9 +1977,6 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, bool rtnl_held = false; u32 flags; - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - replay: tp_created = 0; @@ -2208,9 +2205,6 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, int err; bool rtnl_held = false; - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) @@ -2826,10 +2820,6 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, unsigned long cl; int err; - if (n->nlmsg_type != RTM_GETCHAIN && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - replay: q = NULL; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 9f7680728e2b..db1569fac57c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1425,10 +1425,6 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct Qdisc *p = NULL; int err; - if ((n->nlmsg_type != RTM_GETQDISC) && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) @@ -1509,9 +1505,6 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct Qdisc *q, *p; int err; - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - replay: /* Reinit, just in case something touches this. */ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, @@ -1993,10 +1986,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, u32 qid; int err; - if ((n->nlmsg_type != RTM_GETTCLASS) && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index b4ee163154a6..ee418701cdee 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -882,6 +882,16 @@ s64 vsock_stream_has_space(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_stream_has_space); +void vsock_data_ready(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat || + sock_flag(sk, SOCK_DONE)) + sk->sk_data_ready(sk); +} +EXPORT_SYMBOL_GPL(vsock_data_ready); + static int vsock_release(struct socket *sock) { __vsock_release(sock->sk, 0); @@ -1066,8 +1076,9 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; + int target = sock_rcvlowat(sk, 0, INT_MAX); int ret = transport->notify_poll_in( - vsk, 1, &data_ready_now); + vsk, target, &data_ready_now); if (ret < 0) { mask |= EPOLLERR; } else { @@ -2137,6 +2148,25 @@ out: return err; } +static int vsock_set_rcvlowat(struct sock *sk, int val) +{ + const struct vsock_transport *transport; + struct vsock_sock *vsk; + + vsk = vsock_sk(sk); + + if (val > vsk->buffer_size) + return -EINVAL; + + transport = vsk->transport; + + if (transport && transport->set_rcvlowat) + return transport->set_rcvlowat(vsk, val); + + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); + return 0; +} + static const struct proto_ops vsock_stream_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, @@ -2156,6 +2186,7 @@ static const struct proto_ops vsock_stream_ops = { .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, + .set_rcvlowat = vsock_set_rcvlowat, }; static const struct proto_ops vsock_seqpacket_ops = { diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index fd98229e3db3..59c3e2697069 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -815,6 +815,12 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, return 0; } +static +int hvs_set_rcvlowat(struct vsock_sock *vsk, int val) +{ + return -EOPNOTSUPP; +} + static struct vsock_transport hvs_transport = { .module = THIS_MODULE, @@ -850,6 +856,7 @@ static struct vsock_transport hvs_transport = { .notify_send_pre_enqueue = hvs_notify_send_pre_enqueue, .notify_send_post_enqueue = hvs_notify_send_post_enqueue, + .set_rcvlowat = hvs_set_rcvlowat }; static bool hvs_check_transport(struct vsock_sock *vsk) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index ec2c2afbf0d0..35863132f4f1 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -634,10 +634,7 @@ virtio_transport_notify_poll_in(struct vsock_sock *vsk, size_t target, bool *data_ready_now) { - if (vsock_stream_has_data(vsk)) - *data_ready_now = true; - else - *data_ready_now = false; + *data_ready_now = vsock_stream_has_data(vsk) >= target; return 0; } @@ -1084,7 +1081,7 @@ virtio_transport_recv_connected(struct sock *sk, switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RW: virtio_transport_recv_enqueue(vsk, pkt); - sk->sk_data_ready(sk); + vsock_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_REQUEST: virtio_transport_send_credit_update(vsk); diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index d69fc4b595ad..7c3a7db134b2 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -307,7 +307,7 @@ vmci_transport_handle_wrote(struct sock *sk, struct vsock_sock *vsk = vsock_sk(sk); PKT_FIELD(vsk, sent_waiting_read) = false; #endif - sk->sk_data_ready(sk); + vsock_data_ready(sk); } static void vmci_transport_notify_pkt_socket_init(struct sock *sk) @@ -340,12 +340,12 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, { struct vsock_sock *vsk = vsock_sk(sk); - if (vsock_stream_has_data(vsk)) { + if (vsock_stream_has_data(vsk) >= target) { *data_ready_now = true; } else { - /* We can't read right now because there is nothing in the - * queue. Ask for notifications when there is something to - * read. + /* We can't read right now because there is not enough data + * in the queue. Ask for notifications when there is something + * to read. */ if (sk->sk_state == TCP_ESTABLISHED) { if (!send_waiting_read(sk, 1)) diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index 0f36d7c45db3..e96a88d850a8 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -84,7 +84,7 @@ vmci_transport_handle_wrote(struct sock *sk, bool bottom_half, struct sockaddr_vm *dst, struct sockaddr_vm *src) { - sk->sk_data_ready(sk); + vsock_data_ready(sk); } static void vsock_block_update_write_window(struct sock *sk) @@ -161,12 +161,12 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, { struct vsock_sock *vsk = vsock_sk(sk); - if (vsock_stream_has_data(vsk)) { + if (vsock_stream_has_data(vsk) >= target) { *data_ready_now = true; } else { - /* We can't read right now because there is nothing in the - * queue. Ask for notifications when there is something to - * read. + /* We can't read right now because there is not enough data + * in the queue. Ask for notifications when there is something + * to read. */ if (sk->sk_state == TCP_ESTABLISHED) vsock_block_update_write_window(sk); @@ -282,7 +282,7 @@ vmci_transport_notify_pkt_recv_post_dequeue( /* See the comment in * vmci_transport_notify_pkt_send_post_enqueue(). */ - sk->sk_data_ready(sk); + vsock_data_ready(sk); } return err; diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index dc577461afc2..bb6d691cb30d 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -18,6 +18,7 @@ #include <sys/socket.h> #include <time.h> #include <sys/mman.h> +#include <poll.h> #include "timeout.h" #include "control.h" @@ -596,6 +597,108 @@ static void test_seqpacket_invalid_rec_buffer_server(const struct test_opts *opt close(fd); } +#define RCVLOWAT_BUF_SIZE 128 + +static void test_stream_poll_rcvlowat_server(const struct test_opts *opts) +{ + int fd; + int i; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte. */ + send_byte(fd, 1, 0); + + control_writeln("SRVSENT"); + + /* Wait until client is ready to receive rest of data. */ + control_expectln("CLNSENT"); + + for (i = 0; i < RCVLOWAT_BUF_SIZE - 1; i++) + send_byte(fd, 1, 0); + + /* Keep socket in active state. */ + control_expectln("POLLDONE"); + + close(fd); +} + +static void test_stream_poll_rcvlowat_client(const struct test_opts *opts) +{ + unsigned long lowat_val = RCVLOWAT_BUF_SIZE; + char buf[RCVLOWAT_BUF_SIZE]; + struct pollfd fds; + ssize_t read_res; + short poll_flags; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, + &lowat_val, sizeof(lowat_val))) { + perror("setsockopt"); + exit(EXIT_FAILURE); + } + + control_expectln("SRVSENT"); + + /* At this point, server sent 1 byte. */ + fds.fd = fd; + poll_flags = POLLIN | POLLRDNORM; + fds.events = poll_flags; + + /* Try to wait for 1 sec. */ + if (poll(&fds, 1, 1000) < 0) { + perror("poll"); + exit(EXIT_FAILURE); + } + + /* poll() must return nothing. */ + if (fds.revents) { + fprintf(stderr, "Unexpected poll result %hx\n", + fds.revents); + exit(EXIT_FAILURE); + } + + /* Tell server to send rest of data. */ + control_writeln("CLNSENT"); + + /* Poll for data. */ + if (poll(&fds, 1, 10000) < 0) { + perror("poll"); + exit(EXIT_FAILURE); + } + + /* Only these two bits are expected. */ + if (fds.revents != poll_flags) { + fprintf(stderr, "Unexpected poll result %hx\n", + fds.revents); + exit(EXIT_FAILURE); + } + + /* Use MSG_DONTWAIT, if call is going to wait, EAGAIN + * will be returned. + */ + read_res = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); + if (read_res != RCVLOWAT_BUF_SIZE) { + fprintf(stderr, "Unexpected recv result %zi\n", + read_res); + exit(EXIT_FAILURE); + } + + control_writeln("POLLDONE"); + + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -646,6 +749,11 @@ static struct test_case test_cases[] = { .run_client = test_seqpacket_invalid_rec_buffer_client, .run_server = test_seqpacket_invalid_rec_buffer_server, }, + { + .name = "SOCK_STREAM poll() + SO_RCVLOWAT", + .run_client = test_stream_poll_rcvlowat_client, + .run_server = test_stream_poll_rcvlowat_server, + }, {}, }; |
