diff options
Diffstat (limited to 'Documentation/driver-api')
98 files changed, 6117 insertions, 1007 deletions
diff --git a/Documentation/driver-api/auxiliary_bus.rst b/Documentation/driver-api/auxiliary_bus.rst index cec84908fbc0..b236de773e1d 100644 --- a/Documentation/driver-api/auxiliary_bus.rst +++ b/Documentation/driver-api/auxiliary_bus.rst @@ -24,7 +24,6 @@ Auxiliary Device Creation .. kernel-doc:: drivers/base/auxiliary.c :identifiers: auxiliary_device_init __auxiliary_device_add - auxiliary_find_device Auxiliary Device Memory Model and Lifespan ------------------------------------------ diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst index d78b7c328ff7..5e9f7aee71a7 100644 --- a/Documentation/driver-api/basics.rst +++ b/Documentation/driver-api/basics.rst @@ -108,6 +108,9 @@ Kernel objects manipulation .. kernel-doc:: lib/kobject.c :export: +.. kernel-doc:: lib/kobject_uevent.c + :export: + Kernel utility functions ------------------------ diff --git a/Documentation/driver-api/coco/index.rst b/Documentation/driver-api/coco/index.rst new file mode 100644 index 000000000000..af9f08ca0cfd --- /dev/null +++ b/Documentation/driver-api/coco/index.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Confidential Computing +====================== + +.. toctree:: + :maxdepth: 1 + + measurement-registers + +.. only:: subproject and html diff --git a/Documentation/driver-api/coco/measurement-registers.rst b/Documentation/driver-api/coco/measurement-registers.rst new file mode 100644 index 000000000000..962a44efa2c0 --- /dev/null +++ b/Documentation/driver-api/coco/measurement-registers.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: <isonum.txt> + +===================== +Measurement Registers +===================== + +.. kernel-doc:: include/linux/tsm-mr.h + :internal: + +.. kernel-doc:: drivers/virt/coco/guest/tsm-mr.c + :export: diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst index f4fba897d931..8e50b900d51c 100644 --- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst +++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst @@ -272,7 +272,7 @@ The available attributes are: echo async_irq > /sys/bus/dsa/drivers/crypto/sync_mode Async mode without interrupts (caller must poll) can be enabled by - writing 'async' to it:: + writing 'async' to it (please see Caveat):: echo async > /sys/bus/dsa/drivers/crypto/sync_mode @@ -283,6 +283,13 @@ The available attributes are: The default mode is 'sync'. + Caveat: since the only mechanism that iaa_crypto currently implements + for async polling without interrupts is via the 'sync' mode as + described earlier, writing 'async' to + '/sys/bus/dsa/drivers/crypto/sync_mode' will internally enable the + 'sync' mode. This is to ensure correct iaa_crypto behavior until true + async polling without interrupts is enabled in iaa_crypto. + .. _iaa_default_config: IAA Default Configuration @@ -471,7 +478,6 @@ Use the following commands to enable zswap:: # echo deflate-iaa > /sys/module/zswap/parameters/compressor # echo zsmalloc > /sys/module/zswap/parameters/zpool # echo 1 > /sys/module/zswap/parameters/enabled - # echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled # echo 100 > /proc/sys/vm/swappiness # echo never > /sys/kernel/mm/transparent_hugepage/enabled # echo 1 > /proc/sys/vm/overcommit_memory @@ -621,7 +627,6 @@ the 'fixed' compression mode:: echo deflate-iaa > /sys/module/zswap/parameters/compressor echo zsmalloc > /sys/module/zswap/parameters/zpool echo 1 > /sys/module/zswap/parameters/enabled - echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled echo 100 > /proc/sys/vm/swappiness echo never > /sys/kernel/mm/transparent_hugepage/enabled diff --git a/Documentation/driver-api/cxl/allocation/dax.rst b/Documentation/driver-api/cxl/allocation/dax.rst new file mode 100644 index 000000000000..c6f7a5da832f --- /dev/null +++ b/Documentation/driver-api/cxl/allocation/dax.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +DAX Devices +=========== +CXL capacity exposed as a DAX device can be accessed directly via mmap. +Users may wish to use this interface mechanism to write their own userland +CXL allocator, or to managed shared or persistent memory regions across multiple +hosts. + +If the capacity is shared across hosts or persistent, appropriate flushing +mechanisms must be employed unless the region supports Snoop Back-Invalidate. + +Note that mappings must be aligned (size and base) to the dax device's base +alignment, which is typically 2MB - but maybe be configured larger. + +:: + + #include <stdio.h> + #include <stdlib.h> + #include <stdint.h> + #include <sys/mman.h> + #include <fcntl.h> + #include <unistd.h> + + #define DEVICE_PATH "/dev/dax0.0" // Replace DAX device path + #define DEVICE_SIZE (4ULL * 1024 * 1024 * 1024) // 4GB + + int main() { + int fd; + void* mapped_addr; + + /* Open the DAX device */ + fd = open(DEVICE_PATH, O_RDWR); + if (fd < 0) { + perror("open"); + return -1; + } + + /* Map the device into memory */ + mapped_addr = mmap(NULL, DEVICE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (mapped_addr == MAP_FAILED) { + perror("mmap"); + close(fd); + return -1; + } + + printf("Mapped address: %p\n", mapped_addr); + + /* You can now access the device through the mapped address */ + uint64_t* ptr = (uint64_t*)mapped_addr; + *ptr = 0x1234567890abcdef; // Write a value to the device + printf("Value at address %p: 0x%016llx\n", ptr, *ptr); + + /* Clean up */ + munmap(mapped_addr, DEVICE_SIZE); + close(fd); + return 0; + } diff --git a/Documentation/driver-api/cxl/allocation/hugepages.rst b/Documentation/driver-api/cxl/allocation/hugepages.rst new file mode 100644 index 000000000000..1023c6922829 --- /dev/null +++ b/Documentation/driver-api/cxl/allocation/hugepages.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +Huge Pages +========== + +Contiguous Memory Allocator +=========================== +CXL Memory onlined as SystemRAM during early boot is eligible for use by CMA, +as the NUMA node hosting that capacity will be `Online` at the time CMA +carves out contiguous capacity. + +CXL Memory deferred to the CXL Driver for configuration cannot have its +capacity allocated by CMA - as the NUMA node hosting the capacity is `Offline` +at :code:`__init` time - when CMA carves out contiguous capacity. + +HugeTLB +======= +Different huge page sizes allow different memory configurations. + +2MB Huge Pages +-------------- +All CXL capacity regardless of configuration time or memory zone is eligible +for use as 2MB huge pages. + +1GB Huge Pages +-------------- +CXL capacity onlined in :code:`ZONE_NORMAL` is eligible for 1GB Gigantic Page +allocation. + +CXL capacity onlined in :code:`ZONE_MOVABLE` is not eligible for 1GB Gigantic +Page allocation. diff --git a/Documentation/driver-api/cxl/allocation/page-allocator.rst b/Documentation/driver-api/cxl/allocation/page-allocator.rst new file mode 100644 index 000000000000..7b8fe1b8d5bb --- /dev/null +++ b/Documentation/driver-api/cxl/allocation/page-allocator.rst @@ -0,0 +1,85 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +The Page Allocator +================== + +The kernel page allocator services all general page allocation requests, such +as :code:`kmalloc`. CXL configuration steps affect the behavior of the page +allocator based on the selected `Memory Zone` and `NUMA node` the capacity is +placed in. + +This section mostly focuses on how these configurations affect the page +allocator (as of Linux v6.15) rather than the overall page allocator behavior. + +NUMA nodes and mempolicy +======================== +Unless a task explicitly registers a mempolicy, the default memory policy +of the linux kernel is to allocate memory from the `local NUMA node` first, +and fall back to other nodes only if the local node is pressured. + +Generally, we expect to see local DRAM and CXL memory on separate NUMA nodes, +with the CXL memory being non-local. Technically, however, it is possible +for a compute node to have no local DRAM, and for CXL memory to be the +`local` capacity for that compute node. + + +Memory Zones +============ +CXL capacity may be onlined in :code:`ZONE_NORMAL` or :code:`ZONE_MOVABLE`. + +As of v6.15, the page allocator attempts to allocate from the highest +available and compatible ZONE for an allocation from the local node first. + +An example of a `zone incompatibility` is attempting to service an allocation +marked :code:`GFP_KERNEL` from :code:`ZONE_MOVABLE`. Kernel allocations are +typically not migratable, and as a result can only be serviced from +:code:`ZONE_NORMAL` or lower. + +To simplify this, the page allocator will prefer :code:`ZONE_MOVABLE` over +:code:`ZONE_NORMAL` by default, but if :code:`ZONE_MOVABLE` is depleted, it +will fallback to allocate from :code:`ZONE_NORMAL`. + + +Zone and Node Quirks +==================== +Let's consider a configuration where the local DRAM capacity is largely onlined +into :code:`ZONE_NORMAL`, with no :code:`ZONE_MOVABLE` capacity present. The +CXL capacity has the opposite configuration - all onlined in +:code:`ZONE_MOVABLE`. + +Under the default allocation policy, the page allocator will completely skip +:code:`ZONE_MOVABLE` as a valid allocation target. This is because, as of +Linux v6.15, the page allocator does (approximately) the following: :: + + for (each zone in local_node): + + for (each node in fallback_order): + + attempt_allocation(gfp_flags); + +Because the local node does not have :code:`ZONE_MOVABLE`, the CXL node is +functionally unreachable for direct allocation. As a result, the only way +for CXL capacity to be used is via `demotion` in the reclaim path. + +This configuration also means that if the DRAM ndoe has :code:`ZONE_MOVABLE` +capacity - when that capacity is depleted, the page allocator will actually +prefer CXL :code:`ZONE_MOVABLE` pages over DRAM :code:`ZONE_NORMAL` pages. + +We may wish to invert this priority in future Linux versions. + +If `demotion` and `swap` are disabled, Linux will begin to cause OOM crashes +when the DRAM nodes are depleted. See the reclaim section for more details. + + +CGroups and CPUSets +=================== +Finally, assuming CXL memory is reachable via the page allocation (i.e. onlined +in :code:`ZONE_NORMAL`), the :code:`cpusets.mems_allowed` may be used by +containers to limit the accessibility of certain NUMA nodes for tasks in that +container. Users may wish to utilize this in multi-tenant systems where some +tasks prefer not to use slower memory. + +In the reclaim section we'll discuss some limitations of this interface to +prevent demotions of shared data to CXL memory (if demotions are enabled). + diff --git a/Documentation/driver-api/cxl/allocation/reclaim.rst b/Documentation/driver-api/cxl/allocation/reclaim.rst new file mode 100644 index 000000000000..f40f1cae391a --- /dev/null +++ b/Documentation/driver-api/cxl/allocation/reclaim.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +Reclaim +======= +Another way CXL memory can be utilized *indirectly* is via the reclaim system +in :code:`mm/vmscan.c`. Reclaim is engaged when memory capacity on the system +becomes pressured based on global and cgroup-local `watermark` settings. + +In this section we won't discuss the `watermark` configurations, just how CXL +memory can be consumed by various pieces of reclaim system. + +Demotion +======== +By default, the reclaim system will prefer swap (or zswap) when reclaiming +memory. Enabling :code:`kernel/mm/numa/demotion_enabled` will cause vmscan +to opportunistically prefer distant NUMA nodes to swap or zswap, if capacity +is available. + +Demotion engages the :code:`mm/memory_tier.c` component to determine the +next demotion node. The next demotion node is based on the :code:`HMAT` +or :code:`CDAT` performance data. + +cpusets.mems_allowed quirk +-------------------------- +In Linux v6.15 and below, demotion does not respect :code:`cpusets.mems_allowed` +when migrating pages. As a result, if demotion is enabled, vmscan cannot +guarantee isolation of a container's memory from nodes not set in mems_allowed. + +In Linux v6.XX and up, demotion does attempt to respect +:code:`cpusets.mems_allowed`; however, certain classes of shared memory +originally instantiated by another cgroup (such as common libraries - e.g. +libc) may still be demoted. As a result, the mems_allowed interface still +cannot provide perfect isolation from the remote nodes. + +ZSwap and Node Preference +========================= +In Linux v6.15 and below, ZSwap allocates memory from the local node of the +processor for the new pages being compressed. Since pages being compressed +are typically cold, the result is a cold page becomes promoted - only to +be later demoted as it ages off the LRU. + +In Linux v6.XX, ZSwap tries to prefer the node of the page being compressed +as the allocation target for the compression page. This helps prevent +thrashing. + +Demotion with ZSwap +=================== +When enabling both Demotion and ZSwap, you create a situation where ZSwap +will prefer the slowest form of CXL memory by default until that tier of +memory is exhausted. diff --git a/Documentation/driver-api/cxl/devices/device-types.rst b/Documentation/driver-api/cxl/devices/device-types.rst new file mode 100644 index 000000000000..f5e4330c1cfe --- /dev/null +++ b/Documentation/driver-api/cxl/devices/device-types.rst @@ -0,0 +1,165 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Devices and Protocols +===================== + +The type of CXL device (Memory, Accelerator, etc) dictates many configuration steps. This section +covers some basic background on device types and on-device resources used by the platform and OS +which impact configuration. + +Protocols +========= + +There are three core protocols to CXL. For the purpose of this documentation, +we will only discuss very high level definitions as the specific hardware +details are largely abstracted away from Linux. See the CXL specification +for more details. + +CXL.io +------ +The basic interaction protocol, similar to PCIe configuration mechanisms. +Typically used for initialization, configuration, and I/O access for anything +other than memory (CXL.mem) or cache (CXL.cache) operations. + +The Linux CXL driver exposes access to .io functionalty via the various sysfs +interfaces and /dev/cxl/ devices (which exposes direct access to device +mailboxes). + +CXL.cache +--------- +The mechanism by which a device may coherently access and cache host memory. + +Largely transparent to Linux once configured. + +CXL.mem +--------- +The mechanism by which the CPU may coherently access and cache device memory. + +Largely transparent to Linux once configured. + + +Device Types +============ + +Type-1 +------ + +A Type-1 CXL device: + +* Supports cxl.io and cxl.cache protocols +* Implements a fully coherent cache +* Allows Device-to-Host coherence and Host-to-Device snoops. +* Does NOT have host-managed device memory (HDM) + +Typical examples of type-1 devices is a Smart NIC - which may want to +directly operate on host-memory (DMA) to store incoming packets. These +devices largely rely on CPU-attached memory. + +Type-2 +------ + +A Type-2 CXL Device: + +* Supports cxl.io, cxl.cache, and cxl.mem protocols +* Optionally implements coherent cache and Host-Managed Device Memory +* Is typically an accelerator device w/ high bandwidth memory. + +The primary difference between a type-1 and type-2 device is the presence +of host-managed device memory, which allows the device to operate on a +local memory bank - while the CPU sill has coherent DMA to the same memory. + +The allows things like GPUs to expose their memory via DAX devices or file +descriptors, allows drivers and programs direct access to device memory +rather than use block-transfer semantics. + +Type-3 +------ + +A Type-3 CXL Device + +* Supports cxl.io and cxl.mem +* Implements Host-Managed Device Memory +* May provide either Volatile or Persistent memory capacity (or both). + +A basic example of a type-3 device is a simple memory expander, whose +local memory capacity is exposed to the CPU for access directly via +basic coherent DMA. + +Switch +------ + +A CXL switch is a device capacity of routing any CXL (and by extension, PCIe) +protocol between an upstream, downstream, or peer devices. Many devices, such +as Multi-Logical Devices, imply the presence of switching in some manner. + +Logical Devices and Heads +------------------------- + +A CXL device may present one or more "Logical Devices" to one or more hosts +(via physical "Heads"). + +A Single-Logical Device (SLD) is a device which presents a single device to +one or more heads. + +A Multi-Logical Device (MLD) is a device which may present multiple devices +to one or more devices. + +A Single-Headed Device exposes only a single physical connection. + +A Multi-Headed Device exposes multiple physical connections. + +MHSLD +~~~~~ +A Multi-Headed Single-Logical Device (MHSLD) exposes a single logical +device to multiple heads which may be connected to one or more discrete +hosts. An example of this would be a simple memory-pool which may be +statically configured (prior to boot) to expose portions of its memory +to Linux via :doc:`CEDT <../platform/acpi/cedt>`. + +MHMLD +~~~~~ +A Multi-Headed Multi-Logical Device (MHMLD) exposes multiple logical +devices to multiple heads which may be connected to one or more discrete +hosts. An example of this would be a Dynamic Capacity Device or which +may be configured at runtime to expose portions of its memory to Linux. + +Example Devices +=============== + +Memory Expander +--------------- +The simplest form of Type-3 device is a memory expander. A memory expander +exposes Host-Managed Device Memory (HDM) to Linux. This memory may be +Volatile or Non-Volatile (Persistent). + +Memory Expanders will typically be considered a form of Single-Headed, +Single-Logical Device - as its form factor will typically be an add-in-card +(AIC) or some other similar form-factor. + +The Linux CXL driver provides support for static or dynamic configuration of +basic memory expanders. The platform may program decoders prior to OS init +(e.g. auto-decoders), or the user may program the fabric if the platform +defers these operations to the OS. + +Multiple Memory Expanders may be added to an external chassis and exposed to +a host via a head attached to a CXL switch. This is a "memory pool", and +would be considered an MHSLD or MHMLD depending on the management capabilities +provided by the switch platform. + +As of v6.14, Linux does not provide a formalized interface to manage non-DCD +MHSLD or MHMLD devices. + +Dynamic Capacity Device (DCD) +----------------------------- + +A Dynamic Capacity Device is a Type-3 device which provides dynamic management +of memory capacity. The basic premise of a DCD to provide an allocator-like +interface for physical memory capacity to a "Fabric Manager" (an external, +privileged host with privileges to change configurations for other hosts). + +A DCD manages "Memory Extents", which may be volatile or persistent. Extents +may also be exclusive to a single host or shared across multiple hosts. + +As of v6.14, Linux does not provide a formalized interface to manage DCD +devices, however there is active work on LKML targeting future release. diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst index 036e49553542..9e1414ad3357 100644 --- a/Documentation/driver-api/cxl/index.rst +++ b/Documentation/driver-api/cxl/index.rst @@ -4,9 +4,50 @@ Compute Express Link ==================== +CXL device configuration has a complex handoff between platform (Hardware, +BIOS, EFI), OS (early boot, core kernel, driver), and user policy decisions +that have impacts on each other. The docs here break up configurations steps. + +.. toctree:: + :maxdepth: 2 + :caption: Overview + + theory-of-operation + maturity-map + +.. toctree:: + :maxdepth: 2 + :caption: Device Reference + + devices/device-types + +.. toctree:: + :maxdepth: 2 + :caption: Platform Configuration + + platform/bios-and-efi + platform/acpi + platform/cdat + platform/example-configs + +.. toctree:: + :maxdepth: 2 + :caption: Linux Kernel Configuration + + linux/overview + linux/early-boot + linux/cxl-driver + linux/dax-driver + linux/memory-hotplug + linux/access-coordinates + .. toctree:: - :maxdepth: 1 + :maxdepth: 2 + :caption: Memory Allocation - memory-devices + allocation/dax + allocation/page-allocator + allocation/reclaim + allocation/hugepages.rst .. only:: subproject and html diff --git a/Documentation/driver-api/cxl/linux/access-coordinates.rst b/Documentation/driver-api/cxl/linux/access-coordinates.rst new file mode 100644 index 000000000000..341a7c682043 --- /dev/null +++ b/Documentation/driver-api/cxl/linux/access-coordinates.rst @@ -0,0 +1,178 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: <isonum.txt> + +================================== +CXL Access Coordinates Computation +================================== + +Latency and Bandwidth Calculation +================================= +A memory region performance coordinates (latency and bandwidth) are typically +provided via ACPI tables :doc:`SRAT <../platform/acpi/srat>` and +:doc:`HMAT <../platform/acpi/hmat>`. However, the platform firmware (BIOS) is +not able to annotate those for CXL devices that are hot-plugged since they do +not exist during platform firmware initialization. The CXL driver can compute +the performance coordinates by retrieving data from several components. + +The :doc:`SRAT <../platform/acpi/srat>` provides a Generic Port Affinity +subtable that ties a proximity domain to a device handle, which in this case +would be the CXL hostbridge. Using this association, the performance +coordinates for the Generic Port can be retrieved from the +:doc:`HMAT <../platform/acpi/hmat>` subtable. This piece represents the +performance coordinates between a CPU and a Generic Port (CXL hostbridge). + +The :doc:`CDAT <../platform/cdat>` provides the performance coordinates for +the CXL device itself. That is the bandwidth and latency to access that device's +memory region. The DSMAS subtable provides a DSMADHandle that is tied to a +Device Physical Address (DPA) range. The DSLBIS subtable provides the +performance coordinates that's tied to a DSMADhandle and this ties the two +table entries together to provide the performance coordinates for each DPA +region. For example, if a device exports a DRAM region and a PMEM region, +then there would be different performance characteristsics for each of those +regions. + +If there's a CXL switch in the topology, then the performance coordinates for the +switch is provided by SSLBIS subtable. This provides the bandwidth and latency +for traversing the switch between the switch upstream port and the switch +downstream port that points to the endpoint device. + +Simple topology example:: + + GP0/HB0/ACPI0016-0 + RP0 + | + | L0 + | + SW 0 / USP0 + SW 0 / DSP0 + | + | L1 + | + EP0 + +In this example, there is a CXL switch between an endpoint and a root port. +Latency in this example is calculated as such: +L(EP0) - Latency from EP0 CDAT DSMAS+DSLBIS +L(L1) - Link latency between EP0 and SW0DSP0 +L(SW0) - Latency for the switch from SW0 CDAT SSLBIS. +L(L0) - Link latency between SW0 and RP0 +L(RP0) - Latency from root port to CPU via SRAT and HMAT (Generic Port). +Total read and write latencies are the sum of all these parts. + +Bandwidth in this example is calculated as such: +B(EP0) - Bandwidth from EP0 CDAT DSMAS+DSLBIS +B(L1) - Link bandwidth between EP0 and SW0DSP0 +B(SW0) - Bandwidth for the switch from SW0 CDAT SSLBIS. +B(L0) - Link bandwidth between SW0 and RP0 +B(RP0) - Bandwidth from root port to CPU via SRAT and HMAT (Generic Port). +The total read and write bandwidth is the min() of all these parts. + +To calculate the link bandwidth: +LinkOperatingFrequency (GT/s) is the current negotiated link speed. +DataRatePerLink (MB/s) = LinkOperatingFrequency / 8 +Bandwidth (MB/s) = PCIeCurrentLinkWidth * DataRatePerLink +Where PCIeCurrentLinkWidth is the number of lanes in the link. + +To calculate the link latency: +LinkLatency (picoseconds) = FlitSize / LinkBandwidth (MB/s) + +See `CXL Memory Device SW Guide r1.0 <https://www.intel.com/content/www/us/en/content-details/643805/cxl-memory-device-software-guide.html>`_, +section 2.11.3 and 2.11.4 for details. + +In the end, the access coordinates for a constructed memory region is calculated from one +or more memory partitions from each of the CXL device(s). + +Shared Upstream Link Calculation +================================ +For certain CXL region construction with endpoints behind CXL switches (SW) or +Root Ports (RP), there is the possibility of the total bandwidth for all +the endpoints behind a switch being more than the switch upstream link. +A similar situation can occur within the host, upstream of the root ports. +The CXL driver performs an additional pass after all the targets have +arrived for a region in order to recalculate the bandwidths with possible +upstream link being a limiting factor in mind. + +The algorithm assumes the configuration is a symmetric topology as that +maximizes performance. When asymmetric topology is detected, the calculation +is aborted. An asymmetric topology is detected during topology walk where the +number of RPs detected as a grandparent is not equal to the number of devices +iterated in the same iteration loop. The assumption is made that subtle +asymmetry in properties does not happen and all paths to EPs are equal. + +There can be multiple switches under an RP. There can be multiple RPs under +a CXL Host Bridge (HB). There can be multiple HBs under a CXL Fixed Memory +Window Structure (CFMWS) in the :doc:`CEDT <../platform/acpi/cedt>`. + +An example hierarchy:: + + CFMWS 0 + | + _________|_________ + | | + ACPI0017-0 ACPI0017-1 + GP0/HB0/ACPI0016-0 GP1/HB1/ACPI0016-1 + | | | | + RP0 RP1 RP2 RP3 + | | | | + SW 0 SW 1 SW 2 SW 3 + | | | | | | | | + EP0 EP1 EP2 EP3 EP4 EP5 EP6 EP7 + +Computation for the example hierarchy: + +Min (GP0 to CPU BW, + Min(SW 0 Upstream Link to RP0 BW, + Min(SW0SSLBIS for SW0DSP0 (EP0), EP0 DSLBIS, EP0 Upstream Link) + + Min(SW0SSLBIS for SW0DSP1 (EP1), EP1 DSLBIS, EP1 Upstream link)) + + Min(SW 1 Upstream Link to RP1 BW, + Min(SW1SSLBIS for SW1DSP0 (EP2), EP2 DSLBIS, EP2 Upstream Link) + + Min(SW1SSLBIS for SW1DSP1 (EP3), EP3 DSLBIS, EP3 Upstream link))) + +Min (GP1 to CPU BW, + Min(SW 2 Upstream Link to RP2 BW, + Min(SW2SSLBIS for SW2DSP0 (EP4), EP4 DSLBIS, EP4 Upstream Link) + + Min(SW2SSLBIS for SW2DSP1 (EP5), EP5 DSLBIS, EP5 Upstream link)) + + Min(SW 3 Upstream Link to RP3 BW, + Min(SW3SSLBIS for SW3DSP0 (EP6), EP6 DSLBIS, EP6 Upstream Link) + + Min(SW3SSLBIS for SW3DSP1 (EP7), EP7 DSLBIS, EP7 Upstream link)))) + +The calculation starts at cxl_region_shared_upstream_perf_update(). A xarray +is created to collect all the endpoint bandwidths via the +cxl_endpoint_gather_bandwidth() function. The min() of bandwidth from the +endpoint CDAT and the upstream link bandwidth is calculated. If the endpoint +has a CXL switch as a parent, then min() of calculated bandwidth and the +bandwidth from the SSLBIS for the switch downstream port that is associated +with the endpoint is calculated. The final bandwidth is stored in a +'struct cxl_perf_ctx' in the xarray indexed by a device pointer. If the +endpoint is direct attached to a root port (RP), the device pointer would be an +RP device. If the endpoint is behind a switch, the device pointer would be the +upstream device of the parent switch. + +At the next stage, the code walks through one or more switches if they exist +in the topology. For endpoints directly attached to RPs, this step is skipped. +If there is another switch upstream, the code takes the min() of the current +gathered bandwidth and the upstream link bandwidth. If there's a switch +upstream, then the SSLBIS of the upstream switch. + +Once the topology walk reaches the RP, whether it's direct attached endpoints +or walking through the switch(es), cxl_rp_gather_bandwidth() is called. At +this point all the bandwidths are aggregated per each host bridge, which is +also the index for the resulting xarray. + +The next step is to take the min() of the per host bridge bandwidth and the +bandwidth from the Generic Port (GP). The bandwidths for the GP are retrieved +via ACPI tables (:doc:`SRAT <../platform/acpi/srat>` and +:doc:`HMAT <../platform/acpi/hmat>`). The minimum bandwidth are aggregated +under the same ACPI0017 device to form a new xarray. + +Finally, the cxl_region_update_bandwidth() is called and the aggregated +bandwidth from all the members of the last xarray is updated for the +access coordinates residing in the cxl region (cxlr) context. + +QTG ID +====== +Each :doc:`CEDT <../platform/acpi/cedt>` has a QTG ID field. This field provides +the ID that associates with a QoS Throttling Group (QTG) for the CFMWS window. +Once the access coordinates are calculated, an ACPI Device Specific Method can +be issued to the ACPI0016 device to retrieve the QTG ID depends on the access +coordinates provided. The QTG ID for the device can be used as guidance to match +to the CFMWS to setup the best Linux root decoder for the device performance. diff --git a/Documentation/driver-api/cxl/linux/cxl-driver.rst b/Documentation/driver-api/cxl/linux/cxl-driver.rst new file mode 100644 index 000000000000..9759e90c3cf1 --- /dev/null +++ b/Documentation/driver-api/cxl/linux/cxl-driver.rst @@ -0,0 +1,630 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +CXL Driver Operation +==================== + +The devices described in this section are present in :: + + /sys/bus/cxl/devices/ + /dev/cxl/ + +The :code:`cxl-cli` library, maintained as part of the NDTCL project, may +be used to script interactions with these devices. + +Drivers +======= +The CXL driver is split into a number of drivers. + +* cxl_core - fundamental init interface and core object creation +* cxl_port - initializes root and provides port enumeration interface. +* cxl_acpi - initializes root decoders and interacts with ACPI data. +* cxl_p/mem - initializes memory devices +* cxl_pci - uses cxl_port to enumates the actual fabric hierarchy. + +Driver Devices +============== +Here is an example from a single-socket system with 4 host bridges. Two host +bridges have a single memory device attached, and the devices are interleaved +into a single memory region. The memory region has been converted to dax. :: + + # ls /sys/bus/cxl/devices/ + dax_region0 decoder3.0 decoder6.0 mem0 port3 + decoder0.0 decoder4.0 decoder6.1 mem1 port4 + decoder1.0 decoder5.0 endpoint5 port1 region0 + decoder2.0 decoder5.1 endpoint6 port2 root0 + + +.. kernel-render:: DOT + :alt: Digraph of CXL fabric describing host-bridge interleaving + :caption: Diagraph of CXL fabric with a host-bridge interleave memory region + + digraph foo { + "root0" -> "port1"; + "root0" -> "port3"; + "root0" -> "decoder0.0"; + "port1" -> "endpoint5"; + "port3" -> "endpoint6"; + "port1" -> "decoder1.0"; + "port3" -> "decoder3.0"; + "endpoint5" -> "decoder5.0"; + "endpoint6" -> "decoder6.0"; + "decoder0.0" -> "region0"; + "decoder0.0" -> "decoder1.0"; + "decoder0.0" -> "decoder3.0"; + "decoder1.0" -> "decoder5.0"; + "decoder3.0" -> "decoder6.0"; + "decoder5.0" -> "region0"; + "decoder6.0" -> "region0"; + "region0" -> "dax_region0"; + "dax_region0" -> "dax0.0"; + } + +For this section we'll explore the devices present in this configuration, but +we'll explore more configurations in-depth in example configurations below. + +Base Devices +------------ +Most devices in a CXL fabric are a `port` of some kind (because each +device mostly routes request from one device to the next, rather than +provide a direct service). + +Root +~~~~ +The `CXL Root` is logical object created by the `cxl_acpi` driver during +:code:`cxl_acpi_probe` - if the :code:`ACPI0017` `Compute Express Link +Root Object` Device Class is found. + +The Root contains links to: + +* `Host Bridge Ports` defined by CHBS in the :doc:`CEDT<../platform/acpi/cedt>` + +* `Downstream Ports` typically connected to `Host Bridge Ports`. + +* `Root Decoders` defined by CFMWS the :doc:`CEDT<../platform/acpi/cedt>` + +:: + + # ls /sys/bus/cxl/devices/root0 + decoder0.0 dport0 dport5 port2 subsystem + decoders_committed dport1 modalias port3 uevent + devtype dport4 port1 port4 uport + + # cat /sys/bus/cxl/devices/root0/devtype + cxl_port + + # cat port1/devtype + cxl_port + + # cat decoder0.0/devtype + cxl_decoder_root + +The root is first `logical port` in the CXL fabric, as presented by the Linux +CXL driver. The `CXL root` is a special type of `switch port`, in that it +only has downstream port connections. + +Port +~~~~ +A `port` object is better described as a `switch port`. It may represent a +host bridge to the root or an actual switch port on a switch. A `switch port` +contains one or more decoders used to route memory requests downstream ports, +which may be connected to another `switch port` or an `endpoint port`. + +:: + + # ls /sys/bus/cxl/devices/port1 + decoder1.0 dport0 driver parent_dport uport + decoders_committed dport113 endpoint5 subsystem + devtype dport2 modalias uevent + + # cat devtype + cxl_port + + # cat decoder1.0/devtype + cxl_decoder_switch + + # cat endpoint5/devtype + cxl_port + +CXL `Host Bridges` in the fabric are probed during :code:`cxl_acpi_probe` at +the time the `CXL Root` is probed. The allows for the immediate logical +connection to between the root and host bridge. + +* The root has a downstream port connection to a host bridge + +* The host bridge has an upstream port connection to the root. + +* The host bridge has one or more downstream port connections to switch + or endpoint ports. + +A `Host Bridge` is a special type of CXL `switch port`. It is explicitly +defined in the ACPI specification via `ACPI0016` ID. `Host Bridge` ports +will be probed at `acpi_probe` time, while similar ports on an actual switch +will be probed later. Otherwise, switch and host bridge ports look very +similar - the both contain switch decoders which route accesses between +upstream and downstream ports. + +Endpoint +~~~~~~~~ +An `endpoint` is a terminal port in the fabric. This is a `logical device`, +and may be one of many `logical devices` presented by a memory device. It +is still considered a type of `port` in the fabric. + +An `endpoint` contains `endpoint decoders` and the device's Coherent Device +Attribute Table (which describes the device's capabilities). :: + + # ls /sys/bus/cxl/devices/endpoint5 + CDAT decoders_committed modalias uevent + decoder5.0 devtype parent_dport uport + decoder5.1 driver subsystem + + # cat /sys/bus/cxl/devices/endpoint5/devtype + cxl_port + + # cat /sys/bus/cxl/devices/endpoint5/decoder5.0/devtype + cxl_decoder_endpoint + + +Memory Device (memdev) +~~~~~~~~~~~~~~~~~~~~~~ +A `memdev` is probed and added by the `cxl_pci` driver in :code:`cxl_pci_probe` +and is managed by the `cxl_mem` driver. It primarily provides the `IOCTL` +interface to a memory device, via :code:`/dev/cxl/memN`, and exposes various +device configuration data. :: + + # ls /sys/bus/cxl/devices/mem0 + dev firmware_version payload_max security uevent + driver label_storage_size pmem serial + firmware numa_node ram subsystem + +A Memory Device is a discrete base object that is not a port. While the +physical device it belongs to may also host an `endpoint`, the relationship +between an `endpoint` and a `memdev` is not captured in sysfs. + +Port Relationships +~~~~~~~~~~~~~~~~~~ +In our example described above, there are four host bridges attached to the +root, and two of the host bridges have one endpoint attached. + +.. kernel-render:: DOT + :alt: Digraph of CXL fabric describing host-bridge interleaving + :caption: Diagraph of CXL fabric with a host-bridge interleave memory region + + digraph foo { + "root0" -> "port1"; + "root0" -> "port2"; + "root0" -> "port3"; + "root0" -> "port4"; + "port1" -> "endpoint5"; + "port3" -> "endpoint6"; + } + +Decoders +-------- +A `Decoder` is short for a CXL Host-Managed Device Memory (HDM) Decoder. It is +a device that routes accesses through the CXL fabric to an endpoint, and at +the endpoint translates a `Host Physical` to `Device Physical` Addressing. + +The CXL 3.1 specification heavily implies that only endpoint decoders should +engage in translation of `Host Physical Address` to `Device Physical Address`. +:: + + 8.2.4.20 CXL HDM Decoder Capability Structure + + IMPLEMENTATION NOTE + CXL Host Bridge and Upstream Switch Port Decode Flow + + IMPLEMENTATION NOTE + Device Decode Logic + +These notes imply that there are two logical groups of decoders. + +* Routing Decoder - a decoder which routes accesses but does not translate + addresses from HPA to DPA. + +* Translating Decoder - a decoder which translates accesses from HPA to DPA + for an endpoint to service. + +The CXL drivers distinguish 3 decoder types: root, switch, and endpoint. Only +endpoint decoders are Translating Decoders, all others are Routing Decoders. + +.. note:: PLATFORM VENDORS BE AWARE + + Linux makes a strong assumption that endpoint decoders are the only decoder + in the fabric that actively translates HPA to DPA. Linux assumes routing + decoders pass the HPA unchanged to the next decoder in the fabric. + + It is therefore assumed that any given decoder in the fabric will have an + address range that is a subset of its upstream port decoder. Any deviation + from this scheme undefined per the specification. Linux prioritizes + spec-defined / architectural behavior. + +Decoders may have one or more `Downstream Targets` if configured to interleave +memory accesses. This will be presented in sysfs via the :code:`target_list` +parameter. + +Root Decoder +~~~~~~~~~~~~ +A `Root Decoder` is logical construct of the physical address and interleave +configurations present in the CFMWS field of the :doc:`CEDT +<../platform/acpi/cedt>`. +Linux presents this information as a decoder present in the `CXL Root`. We +consider this a `Root Decoder`, though technically it exists on the boundary +of the CXL specification and platform-specific CXL root implementations. + +Linux considers these logical decoders a type of `Routing Decoder`, and is the +first decoder in the CXL fabric to receive a memory access from the platform's +memory controllers. + +`Root Decoders` are created during :code:`cxl_acpi_probe`. One root decoder +is created per CFMWS entry in the :doc:`CEDT <../platform/acpi/cedt>`. + +The :code:`target_list` parameter is filled by the CFMWS target fields. Targets +of a root decoder are `Host Bridges`, which means interleave done at the root +decoder level is an `Inter-Host-Bridge Interleave`. + +Only root decoders are capable of `Inter-Host-Bridge Interleave`. + +Such interleaves must be configured by the platform and described in the ACPI +CEDT CFMWS, as the target CXL host bridge UIDs in the CFMWS must match the CXL +host bridge UIDs in the CHBS field of the :doc:`CEDT +<../platform/acpi/cedt>` and the UID field of CXL Host Bridges defined in +the :doc:`DSDT <../platform/acpi/dsdt>`. + +Interleave settings in a root decoder describe how to interleave accesses among +the *immediate downstream targets*, not the entire interleave set. + +The memory range described in the root decoder is used to + +1) Create a memory region (:code:`region0` in this example), and + +2) Associate the region with an IO Memory Resource (:code:`kernel/resource.c`) + +:: + + # ls /sys/bus/cxl/devices/decoder0.0/ + cap_pmem devtype region0 + cap_ram interleave_granularity size + cap_type2 interleave_ways start + cap_type3 locked subsystem + create_ram_region modalias target_list + delete_region qos_class uevent + + # cat /sys/bus/cxl/devices/decoder0.0/region0/resource + 0xc050000000 + +The IO Memory Resource is created during early boot when the CFMWS region is +identified in the EFI Memory Map or E820 table (on x86). + +Root decoders are defined as a separate devtype, but are also a type +of `Switch Decoder` due to having downstream targets. :: + + # cat /sys/bus/cxl/devices/decoder0.0/devtype + cxl_decoder_root + +Switch Decoder +~~~~~~~~~~~~~~ +Any non-root, translating decoder is considered a `Switch Decoder`, and will +present with the type :code:`cxl_decoder_switch`. Both `Host Bridge` and `CXL +Switch` (device) decoders are of type :code:`cxl_decoder_switch`. :: + + # ls /sys/bus/cxl/devices/decoder1.0/ + devtype locked size target_list + interleave_granularity modalias start target_type + interleave_ways region subsystem uevent + + # cat /sys/bus/cxl/devices/decoder1.0/devtype + cxl_decoder_switch + + # cat /sys/bus/cxl/devices/decoder1.0/region + region0 + +A `Switch Decoder` has associations between a region defined by a root +decoder and downstream target ports. Interleaving done within a switch decoder +is a multi-downstream-port interleave (or `Intra-Host-Bridge Interleave` for +host bridges). + +Interleave settings in a switch decoder describe how to interleave accesses +among the *immediate downstream targets*, not the entire interleave set. + +Switch decoders are created during :code:`cxl_switch_port_probe` in the +:code:`cxl_port` driver, and is created based on a PCI device's DVSEC +registers. + +Switch decoder programming is validated during probe if the platform programs +them during boot (See `Auto Decoders` below), or on commit if programmed at +runtime (See `Runtime Programming` below). + + +Endpoint Decoder +~~~~~~~~~~~~~~~~ +Any decoder attached to a *terminal* point in the CXL fabric (`An Endpoint`) is +considered an `Endpoint Decoder`. Endpoint decoders are of type +:code:`cxl_decoder_endpoint`. :: + + # ls /sys/bus/cxl/devices/decoder5.0 + devtype locked start + dpa_resource modalias subsystem + dpa_size mode target_type + interleave_granularity region uevent + interleave_ways size + + # cat /sys/bus/cxl/devices/decoder5.0/devtype + cxl_decoder_endpoint + + # cat /sys/bus/cxl/devices/decoder5.0/region + region0 + +An `Endpoint Decoder` has an association with a region defined by a root +decoder and describes the device-local resource associated with this region. + +Unlike root and switch decoders, endpoint decoders translate `Host Physical` to +`Device Physical` address ranges. The interleave settings on an endpoint +therefore describe the entire *interleave set*. + +`Device Physical Address` regions must be committed in-order. For example, the +DPA region starting at 0x80000000 cannot be committed before the DPA region +starting at 0x0. + +As of Linux v6.15, Linux does not support *imbalanced* interleave setups, all +endpoints in an interleave set are expected to have the same interleave +settings (granularity and ways must be the same). + +Endpoint decoders are created during :code:`cxl_endpoint_port_probe` in the +:code:`cxl_port` driver, and is created based on a PCI device's DVSEC registers. + +Decoder Relationships +~~~~~~~~~~~~~~~~~~~~~ +In our example described above, there is one root decoder which routes memory +accesses over two host bridges. Each host bridge has a decoder which routes +access to their singular endpoint targets. Each endpoint has a decoder which +translates HPA to DPA and services the memory request. + +The driver validates relationships between ports by decoder programming, so +we can think of decoders being related in a similarly hierarchical fashion to +ports. + +.. kernel-render:: DOT + :alt: Digraph of hierarchical relationship between root, switch, and endpoint decoders. + :caption: Diagraph of CXL root, switch, and endpoint decoders. + + digraph foo { + "root0" -> "decoder0.0"; + "decoder0.0" -> "decoder1.0"; + "decoder0.0" -> "decoder3.0"; + "decoder1.0" -> "decoder5.0"; + "decoder3.0" -> "decoder6.0"; + } + +Regions +------- + +Memory Region +~~~~~~~~~~~~~ +A `Memory Region` is a logical construct that connects a set of CXL ports in +the fabric to an IO Memory Resource. It is ultimately used to expose the memory +on these devices to the DAX subsystem via a `DAX Region`. + +An example RAM region: :: + + # ls /sys/bus/cxl/devices/region0/ + access0 devtype modalias subsystem uuid + access1 driver mode target0 + commit interleave_granularity resource target1 + dax_region0 interleave_ways size uevent + +A memory region can be constructed during endpoint probe, if decoders were +programmed by BIOS/EFI (see `Auto Decoders`), or by creating a region manually +via a `Root Decoder`'s :code:`create_ram_region` or :code:`create_pmem_region` +interfaces. + +The interleave settings in a `Memory Region` describe the configuration of the +`Interleave Set` - and are what can be expected to be seen in the endpoint +interleave settings. + +.. kernel-render:: DOT + :alt: Digraph of CXL memory region relationships between root and endpoint decoders. + :caption: Regions are created based on root decoder configurations. Endpoint decoders + must be programmed with the same interleave settings as the region. + + digraph foo { + "root0" -> "decoder0.0"; + "decoder0.0" -> "region0"; + "region0" -> "decoder5.0"; + "region0" -> "decoder6.0"; + } + +DAX Region +~~~~~~~~~~ +A `DAX Region` is used to convert a CXL `Memory Region` to a DAX device. A +DAX device may then be accessed directly via a file descriptor interface, or +converted to System RAM via the DAX kmem driver. See the DAX driver section +for more details. :: + + # ls /sys/bus/cxl/devices/dax_region0/ + dax0.0 devtype modalias uevent + dax_region driver subsystem + +Mailbox Interfaces +------------------ +A mailbox command interface for each device is exposed in :: + + /dev/cxl/mem0 + /dev/cxl/mem1 + +These mailboxes may receive any specification-defined command. Raw commands +(custom commands) can only be sent to these interfaces if the build config +:code:`CXL_MEM_RAW_COMMANDS` is set. This is considered a debug and/or +development interface, not an officially supported mechanism for creation +of vendor-specific commands (see the `fwctl` subsystem for that). + +Decoder Programming +=================== + +Runtime Programming +------------------- +During probe, the only decoders *required* to be programmed are `Root Decoders`. +In reality, `Root Decoders` are a logical construct to describe the memory +region and interleave configuration at the host bridge level - as described +in the ACPI CEDT CFMWS. + +All other `Switch` and `Endpoint` decoders may be programmed by the user +at runtime - if the platform supports such configurations. + +This interaction is what creates a `Software Defined Memory` environment. + +See the :code:`cxl-cli` documentation for more information about how to +configure CXL decoders at runtime. + +Auto Decoders +------------- +Auto Decoders are decoders programmed by BIOS/EFI at boot time, and are +almost always locked (cannot be changed). This is done by a platform +which may have a static configuration - or certain quirks which may prevent +dynamic runtime changes to the decoders (such as requiring additional +controller programming within the CPU complex outside the scope of CXL). + +Auto Decoders are probed automatically as long as the devices and memory +regions they are associated with probe without issue. When probing Auto +Decoders, the driver's primary responsibility is to ensure the fabric is +sane - as-if validating runtime programmed regions and decoders. + +If Linux cannot validate auto-decoder configuration, the memory will not +be surfaced as a DAX device - and therefore not be exposed to the page +allocator - effectively stranding it. + +Interleave +---------- + +The Linux CXL driver supports `Cross-Link First` interleave. This dictates +how interleave is programmed at each decoder step, as the driver validates +the relationships between a decoder and it's parent. + +For example, in a `Cross-Link First` interleave setup with 16 endpoints +attached to 4 host bridges, linux expects the following ways/granularity +across the root, host bridge, and endpoints respectively. + +.. flat-table:: 4x4 cross-link first interleave settings + + * - decoder + - ways + - granularity + + * - root + - 4 + - 256 + + * - host bridge + - 4 + - 1024 + + * - endpoint + - 16 + - 256 + +At the root, every a given access will be routed to the +:code:`((HPA / 256) % 4)th` target host bridge. Within a host bridge, every +:code:`((HPA / 1024) % 4)th` target endpoint. Each endpoint translates based +on the entire 16 device interleave set. + +Unbalanced interleave sets are not supported - decoders at a similar point +in the hierarchy (e.g. all host bridge decoders) must have the same ways and +granularity configuration. + +At Root +~~~~~~~ +Root decoder interleave is defined by CFMWS field of the :doc:`CEDT +<../platform/acpi/cedt>`. The CEDT may actually define multiple CFMWS +configurations to describe the same physical capacity, with the intent to allow +users to decide at runtime whether to online memory as interleaved or +non-interleaved. :: + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Window base address : 0000000100000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + First Target : 00000007 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Window base address : 0000000200000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + First Target : 00000006 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Window base address : 0000000300000000 + Window size : 0000000200000000 + Interleave Members (2^n) : 01 + Interleave Arithmetic : 00 + First Target : 00000007 + Next Target : 00000006 + +In this example, the CFMWS defines two discrete non-interleaved 4GB regions +for each host bridge, and one interleaved 8GB region that targets both. This +would result in 3 root decoders presenting in the root. :: + + # ls /sys/bus/cxl/devices/root0/decoder* + decoder0.0 decoder0.1 decoder0.2 + + # cat /sys/bus/cxl/devices/decoder0.0/target_list start size + 7 + 0x100000000 + 0x100000000 + + # cat /sys/bus/cxl/devices/decoder0.1/target_list start size + 6 + 0x200000000 + 0x100000000 + + # cat /sys/bus/cxl/devices/decoder0.2/target_list start size + 7,6 + 0x300000000 + 0x200000000 + +These decoders are not runtime programmable. They are used to generate a +`Memory Region` to bring this memory online with runtime programmed settings +at the `Switch` and `Endpoint` decoders. + +At Host Bridge or Switch +~~~~~~~~~~~~~~~~~~~~~~~~ +`Host Bridge` and `Switch` decoders are programmable via the following fields: + +- :code:`start` - the HPA region associated with the memory region +- :code:`size` - the size of the region +- :code:`target_list` - the list of downstream ports +- :code:`interleave_ways` - the number downstream ports to interleave across +- :code:`interleave_granularity` - the granularity to interleave at. + +Linux expects the :code:`interleave_granularity` of switch decoders to be +derived from their upstream port connections. In `Cross-Link First` interleave +configurations, the :code:`interleave_granularity` of a decoder is equal to +:code:`parent_interleave_granularity * parent_interleave_ways`. + +At Endpoint +~~~~~~~~~~~ +`Endpoint Decoders` are programmed similar to Host Bridge and Switch decoders, +with the exception that the ways and granularity are defined by the interleave +set (e.g. the interleave settings defined by the associated `Memory Region`). + +- :code:`start` - the HPA region associated with the memory region +- :code:`size` - the size of the region +- :code:`interleave_ways` - the number endpoints in the interleave set +- :code:`interleave_granularity` - the granularity to interleave at. + +These settings are used by endpoint decoders to *Translate* memory requests +from HPA to DPA. This is why they must be aware of the entire interleave set. + +Linux does not support unbalanced interleave configurations. As a result, all +endpoints in an interleave set must have the same ways and granularity. + +Example Configurations +====================== +.. toctree:: + :maxdepth: 1 + + example-configurations/single-device.rst + example-configurations/hb-interleave.rst + example-configurations/intra-hb-interleave.rst + example-configurations/multi-interleave.rst diff --git a/Documentation/driver-api/cxl/linux/dax-driver.rst b/Documentation/driver-api/cxl/linux/dax-driver.rst new file mode 100644 index 000000000000..10d953a2167b --- /dev/null +++ b/Documentation/driver-api/cxl/linux/dax-driver.rst @@ -0,0 +1,43 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +DAX Driver Operation +==================== +The `Direct Access Device` driver was originally designed to provide a +memory-like access mechanism to memory-like block-devices. It was +extended to support CXL Memory Devices, which provide user-configured +memory devices. + +The CXL subsystem depends on the DAX subsystem to either: + +- Generate a file-like interface to userland via :code:`/dev/daxN.Y`, or +- Engage the memory-hotplug interface to add CXL memory to page allocator. + +The DAX subsystem exposes this ability through the `cxl_dax_region` driver. +A `dax_region` provides the translation between a CXL `memory_region` and +a `DAX Device`. + +DAX Device +========== +A `DAX Device` is a file-like interface exposed in :code:`/dev/daxN.Y`. A +memory region exposed via dax device can be accessed via userland software +via the :code:`mmap()` system-call. The result is direct mappings to the +CXL capacity in the task's page tables. + +Users wishing to manually handle allocation of CXL memory should use this +interface. + +kmem conversion +=============== +The :code:`dax_kmem` driver converts a `DAX Device` into a series of `hotplug +memory blocks` managed by :code:`kernel/memory-hotplug.c`. This capacity +will be exposed to the kernel page allocator in the user-selected memory +zone. + +The :code:`memmap_on_memory` setting (both global and DAX device local) +dictates where the kernell will allocate the :code:`struct folio` descriptors +for this memory will come from. If :code:`memmap_on_memory` is set, memory +hotplug will set aside a portion of the memory block capacity to allocate +folios. If unset, the memory is allocated via a normal :code:`GFP_KERNEL` +allocation - and as a result will most likely land on the local NUM node of the +CPU executing the hotplug operation. diff --git a/Documentation/driver-api/cxl/linux/early-boot.rst b/Documentation/driver-api/cxl/linux/early-boot.rst new file mode 100644 index 000000000000..a7fc6fc85fbe --- /dev/null +++ b/Documentation/driver-api/cxl/linux/early-boot.rst @@ -0,0 +1,137 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +Linux Init (Early Boot) +======================= + +Linux configuration is split into two major steps: Early-Boot and everything else. + +During early boot, Linux sets up immutable resources (such as numa nodes), while +later operations include things like driver probe and memory hotplug. Linux may +read EFI and ACPI information throughout this process to configure logical +representations of the devices. + +During Linux Early Boot stage (functions in the kernel that have the __init +decorator), the system takes the resources created by EFI/BIOS +(:doc:`ACPI tables <../platform/acpi>`) and turns them into resources that the +kernel can consume. + + +BIOS, Build and Boot Options +============================ + +There are 4 pre-boot options that need to be considered during kernel build +which dictate how memory will be managed by Linux during early boot. + +* EFI_MEMORY_SP + + * BIOS/EFI Option that dictates whether memory is SystemRAM or + Specific Purpose. Specific Purpose memory will be deferred to + drivers to manage - and not immediately exposed as system RAM. + +* CONFIG_EFI_SOFT_RESERVE + + * Linux Build config option that dictates whether the kernel supports + Specific Purpose memory. + +* CONFIG_MHP_DEFAULT_ONLINE_TYPE + + * Linux Build config that dictates whether and how Specific Purpose memory + converted to a dax device should be managed (left as DAX or onlined as + SystemRAM in ZONE_NORMAL or ZONE_MOVABLE). + +* nosoftreserve + + * Linux kernel boot option that dictates whether Soft Reserve should be + supported. Similar to CONFIG_EFI_SOFT_RESERVE. + +Memory Map Creation +=================== + +While the kernel parses the EFI memory map, if :code:`Specific Purpose` memory +is supported and detected, it will set this region aside as +:code:`SOFT_RESERVED`. + +If :code:`EFI_MEMORY_SP=0`, :code:`CONFIG_EFI_SOFT_RESERVE=n`, or +:code:`nosoftreserve=y` - Linux will default a CXL device memory region to +SystemRAM. This will expose the memory to the kernel page allocator in +:code:`ZONE_NORMAL`, making it available for use for most allocations (including +:code:`struct page` and page tables). + +If `Specific Purpose` is set and supported, :code:`CONFIG_MHP_DEFAULT_ONLINE_TYPE_*` +dictates whether the memory is onlined by default (:code:`_OFFLINE` or +:code:`_ONLINE_*`), and if online which zone to online this memory to by default +(:code:`_NORMAL` or :code:`_MOVABLE`). + +If placed in :code:`ZONE_MOVABLE`, the memory will not be available for most +kernel allocations (such as :code:`struct page` or page tables). This may +significant impact performance depending on the memory capacity of the system. + + +NUMA Node Reservation +===================== + +Linux refers to the proximity domains (:code:`PXM`) defined in the :doc:`SRAT +<../platform/acpi/srat>` to create NUMA nodes in :code:`acpi_numa_init`. +Typically, there is a 1:1 relation between :code:`PXM` and NUMA node IDs. + +The SRAT is the only ACPI defined way of defining Proximity Domains. Linux +chooses to, at most, map those 1:1 with NUMA nodes. +:doc:`CEDT <../platform/acpi/cedt>` adds a description of SPA ranges which +Linux may map to one or more NUMA nodes. + +If there are CXL ranges in the CFMWS but not in SRAT, then a fake :code:`PXM` +is created (as of v6.15). In the future, Linux may reject CFMWS not described +by SRAT due to the ambiguity of proximity domain association. + +It is important to note that NUMA node creation cannot be done at runtime. All +possible NUMA nodes are identified at :code:`__init` time, more specifically +during :code:`mm_init`. The CEDT and SRAT must contain sufficient :code:`PXM` +data for Linux to identify NUMA nodes their associated memory regions. + +The relevant code exists in: :code:`linux/drivers/acpi/numa/srat.c`. + +See :doc:`Example Platform Configurations <../platform/example-configs>` +for more info. + +Memory Tiers Creation +===================== +Memory tiers are a collection of NUMA nodes grouped by performance characteristics. +During :code:`__init`, Linux initializes the system with a default memory tier that +contains all nodes marked :code:`N_MEMORY`. + +:code:`memory_tier_init` is called at boot for all nodes with memory online by +default. :code:`memory_tier_late_init` is called during late-init for nodes setup +during driver configuration. + +Nodes are only marked :code:`N_MEMORY` if they have *online* memory. + +Tier membership can be inspected in :: + + /sys/devices/virtual/memory_tiering/memory_tierN/nodelist + 0-1 + +If nodes are grouped which have clear difference in performance, check the +:doc:`HMAT <../platform/acpi/hmat>` and CDAT information for the CXL nodes. All +nodes default to the DRAM tier, unless HMAT/CDAT information is reported to the +memory_tier component via `access_coordinates`. + +For more, see :doc:`CXL access coordinates documentation +<../linux/access-coordinates>`. + +Contiguous Memory Allocation +============================ +The contiguous memory allocator (CMA) enables reservation of contiguous memory +regions on NUMA nodes during early boot. However, CMA cannot reserve memory +on NUMA nodes that are not online during early boot. :: + + void __init hugetlb_cma_reserve(int order) { + if (!node_online(nid)) + /* do not allow reservations */ + } + +This means if users intend to defer management of CXL memory to the driver, CMA +cannot be used to guarantee huge page allocations. If enabling CXL memory as +SystemRAM in `ZONE_NORMAL` during early boot, CMA reservations per-node can be +made with the :code:`cma_pernuma` or :code:`numa_cma` kernel command line +parameters. diff --git a/Documentation/driver-api/cxl/linux/example-configurations/hb-interleave.rst b/Documentation/driver-api/cxl/linux/example-configurations/hb-interleave.rst new file mode 100644 index 000000000000..f071490763a2 --- /dev/null +++ b/Documentation/driver-api/cxl/linux/example-configurations/hb-interleave.rst @@ -0,0 +1,314 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Inter-Host-Bridge Interleave +============================ +This cxl-cli configuration dump shows the following host configuration: + +* A single socket system with one CXL root +* CXL Root has Four (4) CXL Host Bridges +* Two CXL Host Bridges have a single CXL Memory Expander Attached +* The CXL root is configured to interleave across the two host bridges. + +This output is generated by :code:`cxl list -v` and describes the relationships +between objects exposed in :code:`/sys/bus/cxl/devices/`. + +:: + + [ + { + "bus":"root0", + "provider":"ACPI.CXL", + "nr_dports":4, + "dports":[ + { + "dport":"pci0000:00", + "alias":"ACPI0016:01", + "id":0 + }, + { + "dport":"pci0000:a8", + "alias":"ACPI0016:02", + "id":4 + }, + { + "dport":"pci0000:2a", + "alias":"ACPI0016:03", + "id":1 + }, + { + "dport":"pci0000:d2", + "alias":"ACPI0016:00", + "id":5 + } + ], + +This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL +Host Bridges. The `Root` can be considered the singular upstream port attached +to the platform's memory controller - which routes memory requests to it. + +The `ports:root0` section lays out how each of these downstream ports are +configured. If a port is not configured (id's 0 and 1), they are omitted. + +:: + + "ports:root0":[ + { + "port":"port1", + "host":"pci0000:d2", + "depth":1, + "nr_dports":3, + "dports":[ + { + "dport":"0000:d2:01.1", + "alias":"device:02", + "id":0 + }, + { + "dport":"0000:d2:01.3", + "alias":"device:05", + "id":2 + }, + { + "dport":"0000:d2:07.1", + "alias":"device:0d", + "id":113 + } + ], + +This chunk shows the available downstream ports associated with the CXL Host +Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream +ports: :code:`dport1`, :code:`dport2`, and :code:`dport113`.. + +:: + + "endpoints:port1":[ + { + "endpoint":"endpoint5", + "host":"mem0", + "parent_dport":"0000:d2:01.1", + "depth":2, + "memdev":{ + "memdev":"mem0", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:d3:00.0" + }, + "decoders:endpoint5":[ + { + "decoder":"decoder5.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":2, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + } + ], + +This chunk shows the endpoints attached to the host bridge :code:`port1`. + +:code:`endpoint5` contains a single configured decoder :code:`decoder5.0` +which has the same interleave configuration as :code:`region0` (shown later). + +Next we have the decodesr belonging to the host bridge: + +:: + + "decoders:port1":[ + { + "decoder":"decoder1.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":1, + "region":"region0", + "nr_targets":1, + "targets":[ + { + "target":"0000:d2:01.1", + "alias":"device:02", + "position":0, + "id":0 + } + ] + } + ] + }, + +Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`), whose only +target is :code:`dport1` - which is attached to :code:`endpoint5`. + +The following chunk shows a similar configuration for Host Bridge :code:`port3`, +the second host bridge with a memory device attached. + +:: + + { + "port":"port3", + "host":"pci0000:a8", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:a8:01.1", + "alias":"device:c3", + "id":0 + } + ], + "endpoints:port3":[ + { + "endpoint":"endpoint6", + "host":"mem1", + "parent_dport":"0000:a8:01.1", + "depth":2, + "memdev":{ + "memdev":"mem1", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:a9:00.0" + }, + "decoders:endpoint6":[ + { + "decoder":"decoder6.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":2, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + } + ], + "decoders:port3":[ + { + "decoder":"decoder3.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":1, + "region":"region0", + "nr_targets":1, + "targets":[ + { + "target":"0000:a8:01.1", + "alias":"device:c3", + "position":0, + "id":0 + } + ] + } + ] + }, + + +The next chunk shows the two CXL host bridges without attached endpoints. + +:: + + { + "port":"port2", + "host":"pci0000:00", + "depth":1, + "nr_dports":2, + "dports":[ + { + "dport":"0000:00:01.3", + "alias":"device:55", + "id":2 + }, + { + "dport":"0000:00:07.1", + "alias":"device:5d", + "id":113 + } + ] + }, + { + "port":"port4", + "host":"pci0000:2a", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:2a:01.1", + "alias":"device:d0", + "id":0 + } + ] + } + ], + +Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder +applies the interleave across the downstream ports :code:`port1` and +:code:`port3` - with a granularity of 256 bytes. + +This information is generated by the CXL driver reading the ACPI CEDT CMFWS. + +:: + + "decoders:root0":[ + { + "decoder":"decoder0.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":2, + "interleave_granularity":256, + "max_available_extent":0, + "volatile_capable":true, + "nr_targets":2, + "targets":[ + { + "target":"pci0000:a8", + "alias":"ACPI0016:02", + "position":1, + "id":4 + }, + { + "target":"pci0000:d2", + "alias":"ACPI0016:00", + "position":0, + "id":5 + } + ], + +Finally we have the `Memory Region` associated with the `Root Decoder` +:code:`decoder0.0`. This region describes the overall interleave configuration +of the interleave set. + +:: + + "regions:decoder0.0":[ + { + "region":"region0", + "resource":825975898112, + "size":274877906944, + "type":"ram", + "interleave_ways":2, + "interleave_granularity":256, + "decode_state":"commit", + "mappings":[ + { + "position":1, + "memdev":"mem1", + "decoder":"decoder6.0" + }, + { + "position":0, + "memdev":"mem0", + "decoder":"decoder5.0" + } + ] + } + ] + } + ] + } + ] diff --git a/Documentation/driver-api/cxl/linux/example-configurations/intra-hb-interleave.rst b/Documentation/driver-api/cxl/linux/example-configurations/intra-hb-interleave.rst new file mode 100644 index 000000000000..077dfaf8458d --- /dev/null +++ b/Documentation/driver-api/cxl/linux/example-configurations/intra-hb-interleave.rst @@ -0,0 +1,291 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Intra-Host-Bridge Interleave +============================ +This cxl-cli configuration dump shows the following host configuration: + +* A single socket system with one CXL root +* CXL Root has Four (4) CXL Host Bridges +* One (1) CXL Host Bridges has two CXL Memory Expanders Attached +* The Host bridge decoder is programmed to interleave across the expanders. + +This output is generated by :code:`cxl list -v` and describes the relationships +between objects exposed in :code:`/sys/bus/cxl/devices/`. + +:: + + [ + { + "bus":"root0", + "provider":"ACPI.CXL", + "nr_dports":4, + "dports":[ + { + "dport":"pci0000:00", + "alias":"ACPI0016:01", + "id":0 + }, + { + "dport":"pci0000:a8", + "alias":"ACPI0016:02", + "id":4 + }, + { + "dport":"pci0000:2a", + "alias":"ACPI0016:03", + "id":1 + }, + { + "dport":"pci0000:d2", + "alias":"ACPI0016:00", + "id":5 + } + ], + +This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL +Host Bridges. The `Root` can be considered the singular upstream port attached +to the platform's memory controller - which routes memory requests to it. + +The `ports:root0` section lays out how each of these downstream ports are +configured. If a port is not configured (id's 0 and 1), they are omitted. + +:: + + "ports:root0":[ + { + "port":"port1", + "host":"pci0000:d2", + "depth":1, + "nr_dports":3, + "dports":[ + { + "dport":"0000:d2:01.1", + "alias":"device:02", + "id":0 + }, + { + "dport":"0000:d2:01.3", + "alias":"device:05", + "id":2 + }, + { + "dport":"0000:d2:07.1", + "alias":"device:0d", + "id":113 + } + ], + +This chunk shows the available downstream ports associated with the CXL Host +Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream +ports: :code:`dport1`, :code:`dport2`, and :code:`dport113`.. + +:: + + "endpoints:port1":[ + { + "endpoint":"endpoint5", + "host":"mem0", + "parent_dport":"0000:d2:01.1", + "depth":2, + "memdev":{ + "memdev":"mem0", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:d3:00.0" + }, + "decoders:endpoint5":[ + { + "decoder":"decoder5.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":2, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + }, + { + "endpoint":"endpoint6", + "host":"mem1", + "parent_dport":"0000:d2:01.3, + "depth":2, + "memdev":{ + "memdev":"mem1", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:a9:00.0" + }, + "decoders:endpoint6":[ + { + "decoder":"decoder6.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":2, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + } + ], + +This chunk shows the endpoints attached to the host bridge :code:`port1`. + +:code:`endpoint5` contains a single configured decoder :code:`decoder5.0` +which has the same interleave configuration memory region they belong to +(show later). + +Next we have the decoders belonging to the host bridge: + +:: + + "decoders:port1":[ + { + "decoder":"decoder1.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":2, + "interleave_granularity":256, + "region":"region0", + "nr_targets":2, + "targets":[ + { + "target":"0000:d2:01.1", + "alias":"device:02", + "position":0, + "id":0 + }, + { + "target":"0000:d2:01.3", + "alias":"device:05", + "position":1, + "id":0 + } + ] + } + ] + }, + +Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`) with two +targets: :code:`dport1` and :code:`dport3` - which are attached to +:code:`endpoint5` and :code:`endpoint6` respectively. + +The host bridge decoder interleaves these devices at a 256 byte granularity. + +The next chunk shows the three CXL host bridges without attached endpoints. + +:: + + { + "port":"port2", + "host":"pci0000:00", + "depth":1, + "nr_dports":2, + "dports":[ + { + "dport":"0000:00:01.3", + "alias":"device:55", + "id":2 + }, + { + "dport":"0000:00:07.1", + "alias":"device:5d", + "id":113 + } + ] + }, + { + "port":"port3", + "host":"pci0000:a8", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:a8:01.1", + "alias":"device:c3", + "id":0 + } + ], + }, + { + "port":"port4", + "host":"pci0000:2a", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:2a:01.1", + "alias":"device:d0", + "id":0 + } + ] + } + ], + +Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder +applies the interleave across the downstream ports :code:`port1` and +:code:`port3` - with a granularity of 256 bytes. + +This information is generated by the CXL driver reading the ACPI CEDT CMFWS. + +:: + + "decoders:root0":[ + { + "decoder":"decoder0.0", + "resource":825975898112, + "size":274877906944, + "interleave_ways":1, + "max_available_extent":0, + "volatile_capable":true, + "nr_targets":2, + "targets":[ + { + "target":"pci0000:a8", + "alias":"ACPI0016:02", + "position":1, + "id":4 + }, + ], + +Finally we have the `Memory Region` associated with the `Root Decoder` +:code:`decoder0.0`. This region describes the overall interleave configuration +of the interleave set. + +:: + + "regions:decoder0.0":[ + { + "region":"region0", + "resource":825975898112, + "size":274877906944, + "type":"ram", + "interleave_ways":2, + "interleave_granularity":256, + "decode_state":"commit", + "mappings":[ + { + "position":1, + "memdev":"mem1", + "decoder":"decoder6.0" + }, + { + "position":0, + "memdev":"mem0", + "decoder":"decoder5.0" + } + ] + } + ] + } + ] + } + ] diff --git a/Documentation/driver-api/cxl/linux/example-configurations/multi-interleave.rst b/Documentation/driver-api/cxl/linux/example-configurations/multi-interleave.rst new file mode 100644 index 000000000000..008f9053c630 --- /dev/null +++ b/Documentation/driver-api/cxl/linux/example-configurations/multi-interleave.rst @@ -0,0 +1,401 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Multi-Level Interleave +====================== +This cxl-cli configuration dump shows the following host configuration: + +* A single socket system with one CXL root +* CXL Root has Four (4) CXL Host Bridges +* Two CXL Host Bridges have a two CXL Memory Expanders Attached each. +* The CXL root is configured to interleave across the two host bridges. +* Each host bridge with expanders interleaves across two endpoints. + +This output is generated by :code:`cxl list -v` and describes the relationships +between objects exposed in :code:`/sys/bus/cxl/devices/`. + +:: + + [ + { + "bus":"root0", + "provider":"ACPI.CXL", + "nr_dports":4, + "dports":[ + { + "dport":"pci0000:00", + "alias":"ACPI0016:01", + "id":0 + }, + { + "dport":"pci0000:a8", + "alias":"ACPI0016:02", + "id":4 + }, + { + "dport":"pci0000:2a", + "alias":"ACPI0016:03", + "id":1 + }, + { + "dport":"pci0000:d2", + "alias":"ACPI0016:00", + "id":5 + } + ], + +This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL +Host Bridges. The `Root` can be considered the singular upstream port attached +to the platform's memory controller - which routes memory requests to it. + +The `ports:root0` section lays out how each of these downstream ports are +configured. If a port is not configured (id's 0 and 1), they are omitted. + +:: + + "ports:root0":[ + { + "port":"port1", + "host":"pci0000:d2", + "depth":1, + "nr_dports":3, + "dports":[ + { + "dport":"0000:d2:01.1", + "alias":"device:02", + "id":0 + }, + { + "dport":"0000:d2:01.3", + "alias":"device:05", + "id":2 + }, + { + "dport":"0000:d2:07.1", + "alias":"device:0d", + "id":113 + } + ], + +This chunk shows the available downstream ports associated with the CXL Host +Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream +ports: :code:`dport0`, :code:`dport2`, and :code:`dport113`. + +:: + + "endpoints:port1":[ + { + "endpoint":"endpoint5", + "host":"mem0", + "parent_dport":"0000:d2:01.1", + "depth":2, + "memdev":{ + "memdev":"mem0", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:d3:00.0" + }, + "decoders:endpoint5":[ + { + "decoder":"decoder5.0", + "resource":825975898112, + "size":549755813888, + "interleave_ways":4, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + }, + { + "endpoint":"endpoint6", + "host":"mem1", + "parent_dport":"0000:d2:01.3", + "depth":2, + "memdev":{ + "memdev":"mem1", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:d3:00.0" + }, + "decoders:endpoint6":[ + { + "decoder":"decoder6.0", + "resource":825975898112, + "size":549755813888, + "interleave_ways":4, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + } + ], + +This chunk shows the endpoints attached to the host bridge :code:`port1`. + +:code:`endpoint5` contains a single configured decoder :code:`decoder5.0` +which has the same interleave configuration as :code:`region0` (shown later). + +:code:`endpoint6` contains a single configured decoder :code:`decoder5.0` +which has the same interleave configuration as :code:`region0` (shown later). + +Next we have the decoders belonging to the host bridge: + +:: + + "decoders:port1":[ + { + "decoder":"decoder1.0", + "resource":825975898112, + "size":549755813888, + "interleave_ways":2, + "interleave_granularity":512, + "region":"region0", + "nr_targets":2, + "targets":[ + { + "target":"0000:d2:01.1", + "alias":"device:02", + "position":0, + "id":0 + }, + { + "target":"0000:d2:01.3", + "alias":"device:05", + "position":2, + "id":0 + } + ] + } + ] + }, + +Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`), whose +targets are :code:`dport0` and :code:`dport2` - which are attached to +:code:`endpoint5` and :code:`endpoint6` respectively. + +The following chunk shows a similar configuration for Host Bridge :code:`port3`, +the second host bridge with a memory device attached. + +:: + + { + "port":"port3", + "host":"pci0000:a8", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:a8:01.1", + "alias":"device:c3", + "id":0 + }, + { + "dport":"0000:a8:01.3", + "alias":"device:c5", + "id":0 + } + ], + "endpoints:port3":[ + { + "endpoint":"endpoint7", + "host":"mem2", + "parent_dport":"0000:a8:01.1", + "depth":2, + "memdev":{ + "memdev":"mem2", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:a9:00.0" + }, + "decoders:endpoint7":[ + { + "decoder":"decoder7.0", + "resource":825975898112, + "size":549755813888, + "interleave_ways":4, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + }, + { + "endpoint":"endpoint8", + "host":"mem3", + "parent_dport":"0000:a8:01.3", + "depth":2, + "memdev":{ + "memdev":"mem3", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:a9:00.0" + }, + "decoders:endpoint8":[ + { + "decoder":"decoder8.0", + "resource":825975898112, + "size":549755813888, + "interleave_ways":4, + "interleave_granularity":256, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + } + ], + "decoders:port3":[ + { + "decoder":"decoder3.0", + "resource":825975898112, + "size":549755813888, + "interleave_ways":2, + "interleave_granularity":512, + "region":"region0", + "nr_targets":1, + "targets":[ + { + "target":"0000:a8:01.1", + "alias":"device:c3", + "position":1, + "id":0 + }, + { + "target":"0000:a8:01.3", + "alias":"device:c5", + "position":3, + "id":0 + } + ] + } + ] + }, + + +The next chunk shows the two CXL host bridges without attached endpoints. + +:: + + { + "port":"port2", + "host":"pci0000:00", + "depth":1, + "nr_dports":2, + "dports":[ + { + "dport":"0000:00:01.3", + "alias":"device:55", + "id":2 + }, + { + "dport":"0000:00:07.1", + "alias":"device:5d", + "id":113 + } + ] + }, + { + "port":"port4", + "host":"pci0000:2a", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:2a:01.1", + "alias":"device:d0", + "id":0 + } + ] + } + ], + +Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder +applies the interleave across the downstream ports :code:`port1` and +:code:`port3` - with a granularity of 256 bytes. + +This information is generated by the CXL driver reading the ACPI CEDT CMFWS. + +:: + + "decoders:root0":[ + { + "decoder":"decoder0.0", + "resource":825975898112, + "size":549755813888, + "interleave_ways":2, + "interleave_granularity":256, + "max_available_extent":0, + "volatile_capable":true, + "nr_targets":2, + "targets":[ + { + "target":"pci0000:a8", + "alias":"ACPI0016:02", + "position":1, + "id":4 + }, + { + "target":"pci0000:d2", + "alias":"ACPI0016:00", + "position":0, + "id":5 + } + ], + +Finally we have the `Memory Region` associated with the `Root Decoder` +:code:`decoder0.0`. This region describes the overall interleave configuration +of the interleave set. So we see there are a total of :code:`4` interleave +targets across 4 endpoint decoders. + +:: + + "regions:decoder0.0":[ + { + "region":"region0", + "resource":825975898112, + "size":549755813888, + "type":"ram", + "interleave_ways":4, + "interleave_granularity":256, + "decode_state":"commit", + "mappings":[ + { + "position":3, + "memdev":"mem3", + "decoder":"decoder8.0" + }, + { + "position":2, + "memdev":"mem1", + "decoder":"decoder6.0" + } + { + "position":1, + "memdev":"mem2", + "decoder":"decoder7.0" + }, + { + "position":0, + "memdev":"mem0", + "decoder":"decoder5.0" + } + ] + } + ] + } + ] + } + ] diff --git a/Documentation/driver-api/cxl/linux/example-configurations/single-device.rst b/Documentation/driver-api/cxl/linux/example-configurations/single-device.rst new file mode 100644 index 000000000000..5fd38eb0aaf4 --- /dev/null +++ b/Documentation/driver-api/cxl/linux/example-configurations/single-device.rst @@ -0,0 +1,246 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +Single Device +============= +This cxl-cli configuration dump shows the following host configuration: + +* A single socket system with one CXL root +* CXL Root has Four (4) CXL Host Bridges +* One CXL Host Bridges has a single CXL Memory Expander Attached +* No interleave is present. + +This output is generated by :code:`cxl list -v` and describes the relationships +between objects exposed in :code:`/sys/bus/cxl/devices/`. + +:: + + [ + { + "bus":"root0", + "provider":"ACPI.CXL", + "nr_dports":4, + "dports":[ + { + "dport":"pci0000:00", + "alias":"ACPI0016:01", + "id":0 + }, + { + "dport":"pci0000:a8", + "alias":"ACPI0016:02", + "id":4 + }, + { + "dport":"pci0000:2a", + "alias":"ACPI0016:03", + "id":1 + }, + { + "dport":"pci0000:d2", + "alias":"ACPI0016:00", + "id":5 + } + ], + +This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL +Host Bridges. The `Root` can be considered the singular upstream port attached +to the platform's memory controller - which routes memory requests to it. + +The `ports:root0` section lays out how each of these downstream ports are +configured. If a port is not configured (id's 0, 1, and 4), they are omitted. + +:: + + "ports:root0":[ + { + "port":"port1", + "host":"pci0000:d2", + "depth":1, + "nr_dports":3, + "dports":[ + { + "dport":"0000:d2:01.1", + "alias":"device:02", + "id":0 + }, + { + "dport":"0000:d2:01.3", + "alias":"device:05", + "id":2 + }, + { + "dport":"0000:d2:07.1", + "alias":"device:0d", + "id":113 + } + ], + +This chunk shows the available downstream ports associated with the CXL Host +Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream +ports: :code:`dport1`, :code:`dport2`, and :code:`dport113`.. + +:: + + "endpoints:port1":[ + { + "endpoint":"endpoint5", + "host":"mem0", + "parent_dport":"0000:d2:01.1", + "depth":2, + "memdev":{ + "memdev":"mem0", + "ram_size":137438953472, + "serial":0, + "numa_node":0, + "host":"0000:d3:00.0" + }, + "decoders:endpoint5":[ + { + "decoder":"decoder5.0", + "resource":825975898112, + "size":137438953472, + "interleave_ways":1, + "region":"region0", + "dpa_resource":0, + "dpa_size":137438953472, + "mode":"ram" + } + ] + } + ], + +This chunk shows the endpoints attached to the host bridge :code:`port1`. + +:code:`endpoint5` contains a single configured decoder :code:`decoder5.0` +which has the same interleave configuration as :code:`region0` (shown later). + +Next we have the decoders belonging to the host bridge: + +:: + + "decoders:port1":[ + { + "decoder":"decoder1.0", + "resource":825975898112, + "size":137438953472, + "interleave_ways":1, + "region":"region0", + "nr_targets":1, + "targets":[ + { + "target":"0000:d2:01.1", + "alias":"device:02", + "position":0, + "id":0 + } + ] + } + ] + }, + +Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`), whose only +target is :code:`dport1` - which is attached to :code:`endpoint5`. + +The next chunk shows the three CXL host bridges without attached endpoints. + +:: + + { + "port":"port2", + "host":"pci0000:00", + "depth":1, + "nr_dports":2, + "dports":[ + { + "dport":"0000:00:01.3", + "alias":"device:55", + "id":2 + }, + { + "dport":"0000:00:07.1", + "alias":"device:5d", + "id":113 + } + ] + }, + { + "port":"port3", + "host":"pci0000:a8", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:a8:01.1", + "alias":"device:c3", + "id":0 + } + ] + }, + { + "port":"port4", + "host":"pci0000:2a", + "depth":1, + "nr_dports":1, + "dports":[ + { + "dport":"0000:2a:01.1", + "alias":"device:d0", + "id":0 + } + ] + } + ], + +Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder +is a pass-through decoder because :code:`interleave_ways` is set to :code:`1`. + +This information is generated by the CXL driver reading the ACPI CEDT CMFWS. + +:: + + "decoders:root0":[ + { + "decoder":"decoder0.0", + "resource":825975898112, + "size":137438953472, + "interleave_ways":1, + "max_available_extent":0, + "volatile_capable":true, + "nr_targets":1, + "targets":[ + { + "target":"pci0000:d2", + "alias":"ACPI0016:00", + "position":0, + "id":5 + } + ], + +Finally we have the `Memory Region` associated with the `Root Decoder` +:code:`decoder0.0`. This region describes the discrete region associated +with the lone device. + +:: + + "regions:decoder0.0":[ + { + "region":"region0", + "resource":825975898112, + "size":137438953472, + "type":"ram", + "interleave_ways":1, + "decode_state":"commit", + "mappings":[ + { + "position":0, + "memdev":"mem0", + "decoder":"decoder5.0" + } + ] + } + ] + } + ] + } + ] diff --git a/Documentation/driver-api/cxl/linux/memory-hotplug.rst b/Documentation/driver-api/cxl/linux/memory-hotplug.rst new file mode 100644 index 000000000000..af368c2bc9cf --- /dev/null +++ b/Documentation/driver-api/cxl/linux/memory-hotplug.rst @@ -0,0 +1,78 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +Memory Hotplug +============== +The final phase of surfacing CXL memory to the kernel page allocator is for +the `DAX` driver to surface a `Driver Managed` memory region via the +memory-hotplug component. + +There are four major configurations to consider: + +1) Default Online Behavior (on/off and zone) +2) Hotplug Memory Block size +3) Memory Map Resource location +4) Driver-Managed Memory Designation + +Default Online Behavior +======================= +The default-online behavior of hotplug memory is dictated by the following, +in order of precedence: + +- :code:`CONFIG_MHP_DEFAULT_ONLINE_TYPE` Build Configuration +- :code:`memhp_default_state` Boot parameter +- :code:`/sys/devices/system/memory/auto_online_blocks` value + +These dictate whether hotplugged memory blocks arrive in one of three states: + +1) Offline +2) Online in :code:`ZONE_NORMAL` +3) Online in :code:`ZONE_MOVABLE` + +:code:`ZONE_NORMAL` implies this capacity may be used for almost any allocation, +while :code:`ZONE_MOVABLE` implies this capacity should only be used for +migratable allocations. + +:code:`ZONE_MOVABLE` attempts to retain the hotplug-ability of a memory block +so that it the entire region may be hot-unplugged at a later time. Any capacity +onlined into :code:`ZONE_NORMAL` should be considered permanently attached to +the page allocator. + +Hotplug Memory Block Size +========================= +By default, on most architectures, the Hotplug Memory Block Size is either +128MB or 256MB. On x86, the block size increases up to 2GB as total memory +capacity exceeds 64GB. As of v6.15, Linux does not take into account the +size and alignment of the ACPI CEDT CFMWS regions (see Early Boot docs) when +deciding the Hotplug Memory Block Size. + +Memory Map +========== +The location of :code:`struct folio` allocations to represent the hotplugged +memory capacity are dictated by the following system settings: + +- :code:`/sys_module/memory_hotplug/parameters/memmap_on_memory` +- :code:`/sys/bus/dax/devices/daxN.Y/memmap_on_memory` + +If both of these parameters are set to true, :code:`struct folio` for this +capacity will be carved out of the memory block being onlined. This has +performance implications if the memory is particularly high-latency and +its :code:`struct folio` becomes hotly contended. + +If either parameter is set to false, :code:`struct folio` for this capacity +will be allocated from the local node of the processor running the hotplug +procedure. This capacity will be allocated from :code:`ZONE_NORMAL` on +that node, as it is a :code:`GFP_KERNEL` allocation. + +Systems with extremely large amounts of :code:`ZONE_MOVABLE` memory (e.g. +CXL memory pools) must ensure that there is sufficient local +:code:`ZONE_NORMAL` capacity to host the memory map for the hotplugged capacity. + +Driver Managed Memory +===================== +The DAX driver surfaces this memory to memory-hotplug as "Driver Managed". This +is not a configurable setting, but it's important to note that driver managed +memory is explicitly excluded from use during kexec. This is required to ensure +any reset or out-of-band operations that the CXL device may be subject to during +a functional system-reboot (such as a reset-on-probe) will not cause portions of +the kexec kernel to be overwritten. diff --git a/Documentation/driver-api/cxl/linux/overview.rst b/Documentation/driver-api/cxl/linux/overview.rst new file mode 100644 index 000000000000..648beb2c8c83 --- /dev/null +++ b/Documentation/driver-api/cxl/linux/overview.rst @@ -0,0 +1,103 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======== +Overview +======== + +This section presents the configuration process of a CXL Type-3 memory device, +and how it is ultimately exposed to users as either a :code:`DAX` device or +normal memory pages via the kernel's page allocator. + +Portions marked with a bullet are points at which certain kernel objects +are generated. + +1) Early Boot + + a) BIOS, Build, and Boot Parameters + + i) EFI_MEMORY_SP + ii) CONFIG_EFI_SOFT_RESERVE + iii) CONFIG_MHP_DEFAULT_ONLINE_TYPE + iv) nosoftreserve + + b) Memory Map Creation + + i) EFI Memory Map / E820 Consulted for Soft-Reserved + + * CXL Memory is set aside to be handled by the CXL driver + + * Soft-Reserved IO Resource created for CFMWS entry + + c) NUMA Node Creation + + * Nodes created from ACPI CEDT CFMWS and SRAT Proximity domains (PXM) + + d) Memory Tier Creation + + * A default memory_tier is created with all nodes. + + e) Contiguous Memory Allocation + + * Any requested CMA is allocated from Online nodes + + f) Init Finishes, Drivers start probing + +2) ACPI and PCI Drivers + + a) Detects PCI device is CXL, marking it for probe by CXL driver + +3) CXL Driver Operation + + a) Base device creation + + * root, port, and memdev devices created + * CEDT CFMWS IO Resource creation + + b) Decoder creation + + * root, switch, and endpoint decoders created + + c) Logical device creation + + * memory_region and endpoint devices created + + d) Devices are associated with each other + + * If auto-decoder (BIOS-programmed decoders), driver validates + configurations, builds associations, and locks configs at probe time. + + * If user-configured, validation and associations are built at + decoder-commit time. + + e) Regions surfaced as DAX region + + * dax_region created + + * DAX device created via DAX driver + +4) DAX Driver Operation + + a) DAX driver surfaces DAX region as one of two dax device modes + + * kmem - dax device is converted to hotplug memory blocks + + * DAX kmem IO Resource creation + + * hmem - dax device is left as daxdev to be accessed as a file. + + * If hmem, journey ends here. + + b) DAX kmem surfaces memory region to Memory Hotplug to add to page + allocator as "driver managed memory" + +5) Memory Hotplug + + a) mhp component surfaces a dax device memory region as multiple memory + blocks to the page allocator + + * blocks appear in :code:`/sys/bus/memory/devices` and linked to a NUMA node + + b) blocks are onlined into the requested zone (NORMAL or MOVABLE) + + * Memory is marked "Driver Managed" to avoid kexec from using it as region + for kernel updates diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst new file mode 100644 index 000000000000..1330f3f52129 --- /dev/null +++ b/Documentation/driver-api/cxl/maturity-map.rst @@ -0,0 +1,202 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: <isonum.txt> + +=========================================== +Compute Express Link Subsystem Maturity Map +=========================================== + +The Linux CXL subsystem tracks the dynamic `CXL specification +<https://computeexpresslink.org/cxl-specification-landing-page>`_ that +continues to respond to new use cases with new features, capability +updates and fixes. At any given point some aspects of the subsystem are +more mature than others. While the periodic pull requests summarize the +`work being incorporated each merge window +<https://lore.kernel.org/linux-cxl/?q=s%3APULL+s%3ACXL+tc%3Atorvalds+NOT+s%3ARe>`_, +those do not always convey progress relative to a starting point and a +future end goal. + +What follows is a coarse breakdown of the subsystem's major +responsibilities along with a maturity score. The expectation is that +the change-history of this document provides an overview summary of the +subsystem maturation over time. + +The maturity scores are: + +- [3] Mature: Work in this area is complete and no changes on the horizon. + Note that this score can regress from one kernel release to the next + based on new test results or end user reports. + +- [2] Stabilizing: Major functionality operational, common cases are + mature, but known corner cases are still a work in progress. + +- [1] Initial: Capability that has exited the Proof of Concept phase, but + may still have significant gaps to close and fixes to apply as real + world testing occurs. + +- [0] Known gap: Feature is on a medium to long term horizon to + implement. If the specification has a feature that does not even have + a '0' score in this document, there is a good chance that no one in + the linux-cxl@vger.kernel.org community has started to look at it. + +- X: Out of scope for kernel enabling, or kernel enabling not required + +Feature and Capabilities +======================== + +Enumeration / Provisioning +-------------------------- +All of the fundamental enumeration an object model of the subsystem is +in place, but there are several corner cases that are pending closure. + + +* [2] CXL Window Enumeration + + * [2] :ref:`Extended-linear memory-side cache <extended-linear>` + * [0] Low Memory-hole + * [X] Hetero-interleave + +* [2] Switch Enumeration + + * [0] CXL register enumeration link-up dependency + +* [2] HDM Decoder Configuration + + * [0] Decoder target and granularity constraints + +* [2] Performance enumeration + + * [3] Endpoint CDAT + * [3] Switch CDAT + * [1] CDAT to Core-mm integration + + * [1] x86 + * [0] Arm64 + * [0] All other arch. + + * [0] Shared link + +* [2] Hotplug + (see CXL Window Enumeration) + + * [0] Handle Soft Reserved conflicts + +* [0] :ref:`RCH link status <rch-link-status>` +* [0] Fabrics / G-FAM (chapter 7) +* [0] Global Access Endpoint + + +RAS +--- +In many ways CXL can be seen as a standardization of what would normally +be handled by custom EDAC drivers. The open development here is +mainly caused by the enumeration corner cases above. + +* [3] Component events (OS) +* [2] Component events (FFM) +* [1] Endpoint protocol errors (OS) +* [1] Endpoint protocol errors (FFM) +* [0] Switch protocol errors (OS) +* [1] Switch protocol errors (FFM) +* [2] DPA->HPA Address translation + + * [1] XOR Interleave translation + (see CXL Window Enumeration) + +* [1] Memory Failure coordination +* [0] Scrub control +* [2] ACPI error injection EINJ + + * [0] EINJ v2 + * [X] Compliance DOE + +* [2] Native error injection +* [3] RCH error handling +* [1] VH error handling +* [0] PPR +* [0] Sparing +* [0] Device built in test + + +Mailbox commands +---------------- + +* [3] Firmware update +* [3] Health / Alerts +* [1] :ref:`Background commands <background-commands>` +* [3] Sanitization +* [3] Security commands +* [3] RAW Command Debug Passthrough +* [0] CEL-only-validation Passthrough +* [0] Switch CCI +* [3] Timestamp +* [1] PMEM labels +* [3] PMEM GPF / Dirty Shutdown +* [0] Scan Media + +PMU +--- +* [1] Type 3 PMU +* [0] Switch USP/ DSP, Root Port + +Security +-------- + +* [X] CXL Trusted Execution Environment Security Protocol (TSP) +* [X] CXL IDE (subsumed by TSP) + +Memory-pooling +-------------- + +* [1] Hotplug of LDs (via PCI hotplug) +* [0] Dynamic Capacity Device (DCD) Support + +Multi-host sharing +------------------ + +* [0] Hardware coherent shared memory +* [0] Software managed coherency shared memory + +Multi-host memory +----------------- + +* [0] Dynamic Capacity Device Support +* [0] Sharing + +Accelerator +----------- + +* [0] Accelerator memory enumeration HDM-D (CXL 1.1/2.0 Type-2) +* [0] Accelerator memory enumeration HDM-DB (CXL 3.0 Type-2) +* [0] CXL.cache 68b (CXL 2.0) +* [0] CXL.cache 256b Cache IDs (CXL 3.0) + +User Flow Support +----------------- + +* [0] Inject & clear poison by HPA + +Details +======= + +.. _extended-linear: + +* **Extended-linear memory-side cache**: An HMAT proposal to enumerate the presence of a + memory-side cache where the cache capacity extends the SRAT address + range capacity. `See the ECN + <https://lore.kernel.org/linux-cxl/6650e4f835a0e_195e294a8@dwillia2-mobl3.amr.corp.intel.com.notmuch/>`_ + for more details: + +.. _rch-link-status: + +* **RCH Link Status**: RCH (Restricted CXL Host) topologies, end up + hiding some standard registers like PCIe Link Status / Capabilities in + the CXL RCRB (Root Complex Register Block). + +.. _background-commands: + +* **Background commands**: The CXL background command mechanism is + awkward as the single slot is monopolized potentially indefinitely by + various commands. A `cancel on conflict + <http://lore.kernel.org/r/66035c2e8ba17_770232948b@dwillia2-xfh.jf.intel.com.notmuch>`_ + facility is needed to make sure the kernel can ensure forward progress + of priority commands. diff --git a/Documentation/driver-api/cxl/platform/acpi.rst b/Documentation/driver-api/cxl/platform/acpi.rst new file mode 100644 index 000000000000..ee7e6bd4c43d --- /dev/null +++ b/Documentation/driver-api/cxl/platform/acpi.rst @@ -0,0 +1,76 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +ACPI Tables +=========== + +ACPI is the "Advanced Configuration and Power Interface", which is a standard +that defines how platforms and OS manage power and configure computer hardware. +For the purpose of this theory of operation, when referring to "ACPI" we will +usually refer to "ACPI Tables" - which are the way a platform (BIOS/EFI) +communicates static configuration information to the operation system. + +The Following ACPI tables contain *static* configuration and performance data +about CXL devices. + +.. toctree:: + :maxdepth: 1 + + acpi/cedt.rst + acpi/srat.rst + acpi/hmat.rst + acpi/slit.rst + acpi/dsdt.rst + +The SRAT table may also contain generic port/initiator content that is intended +to describe the generic port, but not information about the rest of the path to +the endpoint. + +Linux uses these tables to configure kernel resources for statically configured +(by BIOS/EFI) CXL devices, such as: + +- NUMA nodes +- Memory Tiers +- NUMA Abstract Distances +- SystemRAM Memory Regions +- Weighted Interleave Node Weights + +ACPI Debugging +============== + +The :code:`acpidump -b` command dumps the ACPI tables into binary format. + +The :code:`iasl -d` command disassembles the files into human readable format. + +Example :code:`acpidump -b && iasl -d cedt.dat` :: + + [000h 0000 4] Signature : "CEDT" [CXL Early Discovery Table] + +Common Issues +------------- +Most failures described here result in a failure of the driver to surface +memory as a DAX device and/or kmem. + +* CEDT CFMWS targets list UIDs do not match CEDT CHBS UIDs. +* CEDT CFMWS targets list UIDs do not match DSDT CXL Host Bridge UIDs. +* CEDT CFMWS Restriction Bits are not correct. +* CEDT CFMWS Memory regions are poorly aligned. +* CEDT CFMWS Memory regions spans a platform memory hole. +* CEDT CHBS UIDs do not match DSDT CXL Host Bridge UIDs. +* CEDT CHBS Specification version is incorrect. +* SRAT is missing regions described in CEDT CFMWS. + + * Result: failure to create a NUMA node for the region, or + region is placed in wrong node. + +* HMAT is missing data for regions described in CEDT CFMWS. + + * Result: NUMA node being placed in the wrong memory tier. + +* SLIT has bad data. + + * Result: Lots of performance mechanisms in the kernel will be very unhappy. + +All of these issues will appear to users as if the driver is failing to +support CXL - when in reality they are all the failure of a platform to +configure the ACPI tables correctly. diff --git a/Documentation/driver-api/cxl/platform/acpi/cedt.rst b/Documentation/driver-api/cxl/platform/acpi/cedt.rst new file mode 100644 index 000000000000..1d9c9d3592dc --- /dev/null +++ b/Documentation/driver-api/cxl/platform/acpi/cedt.rst @@ -0,0 +1,62 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +CEDT - CXL Early Discovery Table +================================ + +The CXL Early Discovery Table is generated by BIOS to describe the CXL memory +regions configured at boot by the BIOS. + +CHBS +==== +The CXL Host Bridge Structure describes CXL host bridges. Other than describing +device register information, it reports the specific host bridge UID for this +host bridge. These host bridge ID's will be referenced in other tables. + +Example :: + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000007 <- Host bridge _UID + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010370400000 + Register length : 0000000000010000 + +CFMWS +===== +The CXL Fixed Memory Window structure describes a memory region associated +with one or more CXL host bridges (as described by the CHBS). It additionally +describes any inter-host-bridge interleave configuration that may have been +programmed by BIOS. + +Example :: + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 000000C050000000 <- Memory Region + Window size : 0000003CA0000000 + Interleave Members (2^n) : 01 <- Interleave configuration + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 <- Host Bridge _UID + Next Target : 00000006 <- Host Bridge _UID + +The restriction field dictates what this SPA range may be used for (memory type, +voltile vs persistent, etc). One or more bits may be set. :: + + Bit[0]: CXL Type 2 Memory + Bit[1]: CXL Type 3 Memory + Bit[2]: Volatile Memory + Bit[3]: Persistent Memory + Bit[4]: Fixed Config (HPA cannot be re-used) + +INTRA-host-bridge interleave (multiple devices on one host bridge) is NOT +reported in this structure, and is solely defined via CXL device decoder +programming (host bridge and endpoint decoders). diff --git a/Documentation/driver-api/cxl/platform/acpi/dsdt.rst b/Documentation/driver-api/cxl/platform/acpi/dsdt.rst new file mode 100644 index 000000000000..b4583b01d67d --- /dev/null +++ b/Documentation/driver-api/cxl/platform/acpi/dsdt.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================== +DSDT - Differentiated system Description Table +============================================== + +This table describes what peripherals a machine has. + +This table's UIDs for CXL devices - specifically host bridges, must be +consistent with the contents of the CEDT, otherwise the CXL driver will +fail to probe correctly. + +Example Compute Express Link Host Bridge :: + + Scope (_SB) + { + Device (S0D0) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + Name (_CID, Package (0x02) // _CID: Compatible ID + { + EisaId ("PNP0A08") /* PCI Express Bus */, + EisaId ("PNP0A03") /* PCI Bus */ + }) + ... + Name (_UID, 0x05) // _UID: Unique ID + ... + } diff --git a/Documentation/driver-api/cxl/platform/acpi/hmat.rst b/Documentation/driver-api/cxl/platform/acpi/hmat.rst new file mode 100644 index 000000000000..095a26f02a37 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/acpi/hmat.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================== +HMAT - Heterogeneous Memory Attribute Table +=========================================== + +The Heterogeneous Memory Attributes Table contains information such as cache +attributes and bandwidth and latency details for memory proximity domains. +For the purpose of this document, we will only discuss the SSLIB entry. + +SLLBI +===== +The System Locality Latency and Bandwidth Information records latency and +bandwidth information for proximity domains. + +This table is used by Linux to configure interleave weights and memory tiers. + +Example (Heavily truncated for brevity) :: + + Structure Type : 0001 [SLLBI] + Data Type : 00 <- Latency + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Entry : 0080 <- DRAM LTC + Entry : 0100 <- CXL LTC + + Structure Type : 0001 [SLLBI] + Data Type : 03 <- Bandwidth + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Entry : 1200 <- DRAM BW + Entry : 0200 <- CXL BW diff --git a/Documentation/driver-api/cxl/platform/acpi/slit.rst b/Documentation/driver-api/cxl/platform/acpi/slit.rst new file mode 100644 index 000000000000..a56768e8fe41 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/acpi/slit.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================== +SLIT - System Locality Information Table +======================================== + +The system locality information table provides "abstract distances" between +accessor and memory nodes. Node without initiators (cpus) are infinitely (FF) +distance away from all other nodes. + +The abstract distance described in this table does not describe any real +latency of bandwidth information. + +Example :: + + Signature : "SLIT" [System Locality Information Table] + Localities : 0000000000000004 + Locality 0 : 10 20 20 30 + Locality 1 : 20 10 30 20 + Locality 2 : FF FF 0A FF + Locality 3 : FF FF FF 0A diff --git a/Documentation/driver-api/cxl/platform/acpi/srat.rst b/Documentation/driver-api/cxl/platform/acpi/srat.rst new file mode 100644 index 000000000000..cc98ca0e508e --- /dev/null +++ b/Documentation/driver-api/cxl/platform/acpi/srat.rst @@ -0,0 +1,71 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +SRAT - Static Resource Affinity Table +===================================== + +The System/Static Resource Affinity Table describes resource (CPU, Memory) +affinity to "Proximity Domains". This table is technically optional, but for +performance information (see "HMAT") to be enumerated by linux it must be +present. + +There is a careful dance between the CEDT and SRAT tables and how NUMA nodes are +created. If things don't look quite the way you expect - check the SRAT Memory +Affinity entries and CEDT CFMWS to determine what your platform actually +supports in terms of flexible topologies. + +The SRAT may statically assign portions of a CFMWS SPA range to a specific +proximity domains. See linux numa creation for more information about how +this presents in the NUMA topology. + +Proximity Domain +================ +A proximity domain is ROUGHLY equivalent to "NUMA Node" - though a 1-to-1 +mapping is not guaranteed. There are scenarios where "Proximity Domain 4" may +map to "NUMA Node 3", for example. (See "NUMA Node Creation") + +Memory Affinity +=============== +Generally speaking, if a host does any amount of CXL fabric (decoder) +programming in BIOS - an SRAT entry for that memory needs to be present. + +Example :: + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000001 <- NUMA Node 1 + Reserved1 : 0000 + Base Address : 000000C050000000 <- Physical Memory Region + Address Length : 0000003CA0000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + +Generic Port Affinity +===================== +The Generic Port Affinity subtable provides an association between a proximity +domain and a device handle representing a Generic Port such as a CXL host +bridge. With the association, latency and bandwidth numbers can be retrieved +from the SRAT for the path between CPU(s) (initiator) and the Generic Port. +This is used to construct performance coordinates for hotplugged CXL DEVICES, +which cannot be enumerated at boot by platform firmware. + +Example :: + + Subtable Type : 06 [Generic Port Affinity] + Length : 20 <- 32d, length of table + Reserved : 00 + Device Handle Type : 00 <- 0 - ACPI, 1 - PCI + Proximity Domain : 00000001 + Device Handle : ACPI0016:01 + Flags : 00000001 <- Bit 0 (Enabled) + Reserved : 00000000 + +The Proximity Domain is matched up to the :doc:`HMAT <hmat>` SSLBI Target +Proximity Domain List for the related latency or bandwidth numbers. Those +performance numbers are tied to a CXL host bridge via the Device Handle. +The driver uses the association to retrieve the Generic Port performance +numbers for the whole CXL path access coordinates calculation. diff --git a/Documentation/driver-api/cxl/platform/bios-and-efi.rst b/Documentation/driver-api/cxl/platform/bios-and-efi.rst new file mode 100644 index 000000000000..645322632cc9 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/bios-and-efi.rst @@ -0,0 +1,262 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +BIOS/EFI Configuration +====================== + +BIOS and EFI are largely responsible for configuring static information about +devices (or potential future devices) such that Linux can build the appropriate +logical representations of these devices. + +At a high level, this is what occurs during this phase of configuration. + +* The bootloader starts the BIOS/EFI. + +* BIOS/EFI do early device probe to determine static configuration + +* BIOS/EFI creates ACPI Tables that describe static config for the OS + +* BIOS/EFI create the system memory map (EFI Memory Map, E820, etc) + +* BIOS/EFI calls :code:`start_kernel` and begins the Linux Early Boot process. + +Much of what this section is concerned with is ACPI Table production and +static memory map configuration. More detail on these tables can be found +at :doc:`ACPI Tables <acpi>`. + +.. note:: + Platform Vendors should read carefully, as this sections has recommendations + on physical memory region size and alignment, memory holes, HDM interleave, + and what linux expects of HDM decoders trying to work with these features. + +UEFI Settings +============= +If your platform supports it, the :code:`uefisettings` command can be used to +read/write EFI settings. Changes will be reflected on the next reboot. Kexec +is not a sufficient reboot. + +One notable configuration here is the EFI_MEMORY_SP (Specific Purpose) bit. +When this is enabled, this bit tells linux to defer management of a memory +region to a driver (in this case, the CXL driver). Otherwise, the memory is +treated as "normal memory", and is exposed to the page allocator during +:code:`__init`. + +uefisettings examples +--------------------- + +:code:`uefisettings identify` :: + + uefisettings identify + + bios_vendor: xxx + bios_version: xxx + bios_release: xxx + bios_date: xxx + product_name: xxx + product_family: xxx + product_version: xxx + +On some AMD platforms, the :code:`EFI_MEMORY_SP` bit is set via the :code:`CXL +Memory Attribute` field. This may be called something else on your platform. + +:code:`uefisettings get "CXL Memory Attribute"` :: + + selector: xxx + ... + question: Question { + name: "CXL Memory Attribute", + answer: "Enabled", + ... + } + +Physical Memory Map +=================== + +Physical Address Region Alignment +--------------------------------- + +As of Linux v6.14, the hotplug memory system requires memory regions to be +uniform in size and alignment. While the CXL specification allows for memory +regions as small as 256MB, the supported memory block size and alignment for +hotplugged memory is architecture-defined. + +A Linux memory blocks may be as small as 128MB and increase in powers of two. + +* On ARM, the default block size and alignment is either 128MB or 256MB. + +* On x86, the default block size is 256MB, and increases to 2GB as the + capacity of the system increases up to 64GB. + +For best support across versions, platform vendors should place CXL memory at +a 2GB aligned base address, and regions should be 2GB aligned. This also helps +prevent the creating thousands of memory devices (one per block). + +Memory Holes +------------ + +Holes in the memory map are tricky. Consider a 4GB device located at base +address 0x100000000, but with the following memory map :: + + --------------------- + | 0x100000000 | + | CXL | + | 0x1BFFFFFFF | + --------------------- + | 0x1C0000000 | + | MEMORY HOLE | + | 0x1FFFFFFFF | + --------------------- + | 0x200000000 | + | CXL CONT. | + | 0x23FFFFFFF | + --------------------- + +There are two issues to consider: + +* decoder programming, and +* memory block alignment. + +If your architecture requires 2GB uniform size and aligned memory blocks, the +only capacity Linux is capable of mapping (as of v6.14) would be the capacity +from `0x100000000-0x180000000`. The remaining capacity will be stranded, as +they are not of 2GB aligned length. + +Assuming your architecture and memory configuration allows 1GB memory blocks, +this memory map is supported and this should be presented as multiple CFMWS +in the CEDT that describe each side of the memory hole separately - along with +matching decoders. + +Multiple decoders can (and should) be used to manage such a memory hole (see +below), but each chunk of a memory hole should be aligned to a reasonable block +size (larger alignment is always better). If you intend to have memory holes +in the memory map, expect to use one decoder per contiguous chunk of host +physical memory. + +As of v6.14, Linux does provide support for memory hotplug of multiple +physical memory regions separated by a memory hole described by a single +HDM decoder. + + +Decoder Programming +=================== +If BIOS/EFI intends to program the decoders to be statically configured, +there are a few things to consider to avoid major pitfalls that will +prevent Linux compatibility. Some of these recommendations are not +required "per the specification", but Linux makes no guarantees of support +otherwise. + + +Translation Point +----------------- +Per the specification, the only decoders which **TRANSLATE** Host Physical +Address (HPA) to Device Physical Address (DPA) are the **Endpoint Decoders**. +All other decoders in the fabric are intended to route accesses without +translating the addresses. + +This is heavily implied by the specification, see: :: + + CXL Specification 3.1 + 8.2.4.20: CXL HDM Decoder Capability Structure + - Implementation Note: CXL Host Bridge and Upstream Switch Port Decoder Flow + - Implementation Note: Device Decoder Logic + +Given this, Linux makes a strong assumption that decoders between CPU and +endpoint will all be programmed with addresses ranges that are subsets of +their parent decoder. + +Due to some ambiguity in how Architecture, ACPI, PCI, and CXL specifications +"hand off" responsibility between domains, some early adopting platforms +attempted to do translation at the originating memory controller or host +bridge. This configuration requires a platform specific extension to the +driver and is not officially endorsed - despite being supported. + +It is *highly recommended* **NOT** to do this; otherwise, you are on your own +to implement driver support for your platform. + +Interleave and Configuration Flexibility +---------------------------------------- +If providing cross-host-bridge interleave, a CFMWS entry in the :doc:`CEDT +<acpi/cedt>` must be presented with target host-bridges for the interleaved +device sets (there may be multiple behind each host bridge). + +If providing intra-host-bridge interleaving, only 1 CFMWS entry in the CEDT is +required for that host bridge - if it covers the entire capacity of the devices +behind the host bridge. + +If intending to provide users flexibility in programming decoders beyond the +root, you may want to provide multiple CFMWS entries in the CEDT intended for +different purposes. For example, you may want to consider adding: + +1) A CFMWS entry to cover all interleavable host bridges. +2) A CFMWS entry to cover all devices on a single host bridge. +3) A CFMWS entry to cover each device. + +A platform may choose to add all of these, or change the mode based on a BIOS +setting. For each CFMWS entry, Linux expects descriptions of the described +memory regions in the :doc:`SRAT <acpi/srat>` to determine the number of +NUMA nodes it should reserve during early boot / init. + +As of v6.14, Linux will create a NUMA node for each CEDT CFMWS entry, even if +a matching SRAT entry does not exist; however, this is not guaranteed in the +future and such a configuration should be avoided. + +Memory Holes +------------ +If your platform includes memory holes intersparsed between your CXL memory, it +is recommended to utilize multiple decoders to cover these regions of memory, +rather than try to program the decoders to accept the entire range and expect +Linux to manage the overlap. + +For example, consider the Memory Hole described above :: + + --------------------- + | 0x100000000 | + | CXL | + | 0x1BFFFFFFF | + --------------------- + | 0x1C0000000 | + | MEMORY HOLE | + | 0x1FFFFFFFF | + --------------------- + | 0x200000000 | + | CXL CONT. | + | 0x23FFFFFFF | + --------------------- + +Assuming this is provided by a single device attached directly to a host bridge, +Linux would expect the following decoder programming :: + + ----------------------- ----------------------- + | root-decoder-0 | | root-decoder-1 | + | base: 0x100000000 | | base: 0x200000000 | + | size: 0xC0000000 | | size: 0x40000000 | + ----------------------- ----------------------- + | | + ----------------------- ----------------------- + | HB-decoder-0 | | HB-decoder-1 | + | base: 0x100000000 | | base: 0x200000000 | + | size: 0xC0000000 | | size: 0x40000000 | + ----------------------- ----------------------- + | | + ----------------------- ----------------------- + | ep-decoder-0 | | ep-decoder-1 | + | base: 0x100000000 | | base: 0x200000000 | + | size: 0xC0000000 | | size: 0x40000000 | + ----------------------- ----------------------- + +With a CEDT configuration with two CFMWS describing the above root decoders. + +Linux makes no guarantee of support for strange memory hole situations. + +Multi-Media Devices +------------------- +The CFMWS field of the CEDT has special restriction bits which describe whether +the described memory region allows volatile or persistent memory (or both). If +the platform intends to support either: + +1) A device with multiple medias, or +2) Using a persistent memory device as normal memory + +A platform may wish to create multiple CEDT CFMWS entries to describe the same +memory, with the intent of allowing the end user flexibility in how that memory +is configured. Linux does not presently have strong requirements in this area. diff --git a/Documentation/driver-api/cxl/platform/cdat.rst b/Documentation/driver-api/cxl/platform/cdat.rst new file mode 100644 index 000000000000..34bbe7264d71 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/cdat.rst @@ -0,0 +1,118 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================== +Coherent Device Attribute Table (CDAT) +====================================== + +The CDAT provides functional and performance attributes of devices such +as CXL accelerators, switches, or endpoints. The table formatting is +similar to ACPI tables. CDAT data may be parsed by BIOS at boot or may +be enumerated at runtime (after device hotplug, for example). + +Terminology: +DPA - Device Physical Address, used by the CXL device to denote the address +it supports for that device. + +DSMADHandle - A device unique handle that is associated with a DPA range +defined by the DSMAS table. + + +=============================================== +Device Scoped Memory Affinity Structure (DSMAS) +=============================================== + +The DSMAS contains information such as DSMADHandle, the DPA Base, and DPA +Length. + +This table is used by Linux in conjunction with the Device Scoped Latency and +Bandwidth Information Structure (DSLBIS) to determine the performance +attributes of the CXL device itself. + +Example :: + + Structure Type : 00 [DSMAS] + Reserved : 00 + Length : 0018 <- 24d, size of structure + DSMADHandle : 01 + Flags : 00 + Reserved : 0000 + DPA Base : 0000000040000000 <- 1GiB base + DPA Length : 0000000080000000 <- 2GiB size + + +================================================================== +Device Scoped Latency and Bandwidth Information Structure (DSLBIS) +================================================================== + +This table is used by Linux in conjunction with DSMAS to determine the +performance attributes of a CXL device. The DSLBIS contains latency +and bandwidth information based on DSMADHandle matching. + +Example :: + + Structure Type : 01 [DSLBIS] + Reserved : 00 + Length : 18 <- 24d, size of structure + Handle : 0001 <- DSMAS handle + Flags : 00 <- Matches flag field for HMAT SLLBIS + Data Type : 00 <- Latency + Entry Basee Unit : 0000000000001000 <- Entry Base Unit field in HMAT SSLBIS + Entry : 010000000000 <- First byte used here, CXL LTC + Reserved : 0000 + + Structure Type : 01 [DSLBIS] + Reserved : 00 + Length : 18 <- 24d, size of structure + Handle : 0001 <- DSMAS handle + Flags : 00 <- Matches flag field for HMAT SLLBIS + Data Type : 03 <- Bandwidth + Entry Basee Unit : 0000000000001000 <- Entry Base Unit field in HMAT SSLBIS + Entry : 020000000000 <- First byte used here, CXL BW + Reserved : 0000 + + +================================================================== +Switch Scoped Latency and Bandwidth Information Structure (SSLBIS) +================================================================== + +The SSLBIS contains information about the latency and bandwidth of a switch. + +The table is used by Linux to compute the performance coordinates of a CXL path +from the device to the root port where a switch is part of the path. + +Example :: + + Structure Type : 05 [SSLBIS] + Reserved : 00 + Length : 20 <- 32d, length of record, including SSLB entries + Data Type : 00 <- Latency + Reserved : 000000 + Entry Base Unit : 00000000000000001000 <- Matches Entry Base Unit in HMAT SSLBIS + + <- SSLB Entry 0 + Port X ID : 0100 <- First port, 0100h represents an upstream port + Port Y ID : 0000 <- Second port, downstream port 0 + Latency : 0100 <- Port latency + Reserved : 0000 + <- SSLB Entry 1 + Port X ID : 0100 + Port Y ID : 0001 + Latency : 0100 + Reserved : 0000 + + + Structure Type : 05 [SSLBIS] + Reserved : 00 + Length : 18 <- 24d, length of record, including SSLB entry + Data Type : 03 <- Bandwidth + Reserved : 000000 + Entry Base Unit : 00000000000000001000 <- Matches Entry Base Unit in HMAT SSLBIS + + <- SSLB Entry 0 + Port X ID : 0100 <- First port, 0100h represents an upstream port + Port Y ID : FFFF <- Second port, FFFFh indicates any port + Bandwidth : 1200 <- Port bandwidth + Reserved : 0000 + +The CXL driver uses a combination of CDAT, HMAT, SRAT, and other data to +generate "whole path performance" data for a CXL device. diff --git a/Documentation/driver-api/cxl/platform/example-configs.rst b/Documentation/driver-api/cxl/platform/example-configs.rst new file mode 100644 index 000000000000..90a10d7473c6 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/example-configs.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Example Platform Configurations +############################### + +.. toctree:: + :maxdepth: 1 + :caption: Contents + + example-configurations/one-dev-per-hb.rst + example-configurations/multi-dev-per-hb.rst + example-configurations/hb-interleave.rst + example-configurations/flexible.rst diff --git a/Documentation/driver-api/cxl/platform/example-configurations/flexible.rst b/Documentation/driver-api/cxl/platform/example-configurations/flexible.rst new file mode 100644 index 000000000000..dab704b6fcc2 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/example-configurations/flexible.rst @@ -0,0 +1,296 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Flexible Presentation +===================== +This system has a single socket with two CXL host bridges. Each host bridge +has two CXL memory expanders with a 4GB of memory (32GB total). + +On this system, the platform designer wanted to provide the user flexibility +to configure the memory devices in various interleave or NUMA node +configurations. So they provided every combination. + +Things to note: + +* Cross-Bridge interleave is described in one CFMWS that covers all capacity. +* One CFMWS is also described per-host bridge. +* One CFMWS is also described per-device. +* This SRAT describes one node for each of the above CFMWS. +* The HMAT describes performance for each node in the SRAT. + +:doc:`CEDT <../acpi/cedt>`:: + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000007 + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010370400000 + Register length : 0000000000010000 + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000006 + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010380800000 + Register length : 0000000000010000 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000001000000000 + Window size : 0000000400000000 + Interleave Members (2^n) : 01 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 + Second Target : 00000006 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000002000000000 + Window size : 0000000200000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000002200000000 + Window size : 0000000200000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000006 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000003000000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000003100000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000003200000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000006 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000003300000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000006 + +:doc:`SRAT <../acpi/srat>`:: + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000001 + Reserved1 : 0000 + Base Address : 0000001000000000 + Address Length : 0000000400000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000002 + Reserved1 : 0000 + Base Address : 0000002000000000 + Address Length : 0000000200000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000003 + Reserved1 : 0000 + Base Address : 0000002200000000 + Address Length : 0000000200000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000004 + Reserved1 : 0000 + Base Address : 0000003000000000 + Address Length : 0000000100000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000005 + Reserved1 : 0000 + Base Address : 0000003100000000 + Address Length : 0000000100000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000006 + Reserved1 : 0000 + Base Address : 0000003200000000 + Address Length : 0000000100000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000007 + Reserved1 : 0000 + Base Address : 0000003300000000 + Address Length : 0000000100000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + +:doc:`HMAT <../acpi/hmat>`:: + + Structure Type : 0001 [SLLBI] + Data Type : 00 [Latency] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Target Proximity Domain List : 00000002 + Target Proximity Domain List : 00000003 + Target Proximity Domain List : 00000004 + Target Proximity Domain List : 00000005 + Target Proximity Domain List : 00000006 + Target Proximity Domain List : 00000007 + Entry : 0080 + Entry : 0100 + Entry : 0100 + Entry : 0100 + Entry : 0100 + Entry : 0100 + Entry : 0100 + Entry : 0100 + + Structure Type : 0001 [SLLBI] + Data Type : 03 [Bandwidth] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Target Proximity Domain List : 00000002 + Target Proximity Domain List : 00000003 + Target Proximity Domain List : 00000004 + Target Proximity Domain List : 00000005 + Target Proximity Domain List : 00000006 + Target Proximity Domain List : 00000007 + Entry : 1200 + Entry : 0400 + Entry : 0200 + Entry : 0200 + Entry : 0100 + Entry : 0100 + Entry : 0100 + Entry : 0100 + +:doc:`SLIT <../acpi/slit>`:: + + Signature : "SLIT" [System Locality Information Table] + Localities : 0000000000000003 + Locality 0 : 10 20 20 20 20 20 20 20 + Locality 1 : FF 0A FF FF FF FF FF FF + Locality 2 : FF FF 0A FF FF FF FF FF + Locality 3 : FF FF FF 0A FF FF FF FF + Locality 4 : FF FF FF FF 0A FF FF FF + Locality 5 : FF FF FF FF FF 0A FF FF + Locality 6 : FF FF FF FF FF FF 0A FF + Locality 7 : FF FF FF FF FF FF FF 0A + +:doc:`DSDT <../acpi/dsdt>`:: + + Scope (_SB) + { + Device (S0D0) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + ... + Name (_UID, 0x07) // _UID: Unique ID + } + ... + Device (S0D5) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + ... + Name (_UID, 0x06) // _UID: Unique ID + } + } diff --git a/Documentation/driver-api/cxl/platform/example-configurations/hb-interleave.rst b/Documentation/driver-api/cxl/platform/example-configurations/hb-interleave.rst new file mode 100644 index 000000000000..c474dcf09fb0 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/example-configurations/hb-interleave.rst @@ -0,0 +1,107 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Cross-Host-Bridge Interleave +============================ +This system has a single socket with two CXL host bridges. Each host bridge +has a single CXL memory expander with a 4GB of memory. + +Things to note: + +* Cross-Bridge interleave is described. +* The expanders are described by a single CFMWS. +* This SRAT describes one node for both host bridges. +* The HMAT describes a single node's performance. + +:doc:`CEDT <../acpi/cedt>`:: + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000007 + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010370400000 + Register length : 0000000000010000 + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000006 + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010380800000 + Register length : 0000000000010000 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000001000000000 + Window size : 0000000200000000 + Interleave Members (2^n) : 01 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 + Second Target : 00000006 + +:doc:`SRAT <../acpi/srat>`:: + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000001 + Reserved1 : 0000 + Base Address : 0000001000000000 + Address Length : 0000000200000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + +:doc:`HMAT <../acpi/hmat>`:: + + Structure Type : 0001 [SLLBI] + Data Type : 00 [Latency] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Target Proximity Domain List : 00000002 + Entry : 0080 + Entry : 0100 + + Structure Type : 0001 [SLLBI] + Data Type : 03 [Bandwidth] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Target Proximity Domain List : 00000002 + Entry : 1200 + Entry : 0400 + +:doc:`SLIT <../acpi/slit>`:: + + Signature : "SLIT" [System Locality Information Table] + Localities : 0000000000000003 + Locality 0 : 10 20 + Locality 1 : FF 0A + +:doc:`DSDT <../acpi/dsdt>`:: + + Scope (_SB) + { + Device (S0D0) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + ... + Name (_UID, 0x07) // _UID: Unique ID + } + ... + Device (S0D5) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + ... + Name (_UID, 0x06) // _UID: Unique ID + } + } diff --git a/Documentation/driver-api/cxl/platform/example-configurations/multi-dev-per-hb.rst b/Documentation/driver-api/cxl/platform/example-configurations/multi-dev-per-hb.rst new file mode 100644 index 000000000000..a7854a79dbbd --- /dev/null +++ b/Documentation/driver-api/cxl/platform/example-configurations/multi-dev-per-hb.rst @@ -0,0 +1,90 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +Multiple Devices per Host Bridge +================================ + +In this example system we will have a single socket and one CXL host bridge. +There are two CXL memory expanders with 4GB attached to the host bridge. + +Things to note: + +* Intra-Bridge interleave is not described here. +* The expanders are described by a single CEDT/CFMWS. +* This CEDT/SRAT describes one node for both devices. +* There is only one proximity domain the HMAT for both devices. + +:doc:`CEDT <../acpi/cedt>`:: + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000007 + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010370400000 + Register length : 0000000000010000 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000001000000000 + Window size : 0000000200000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 + +:doc:`SRAT <../acpi/srat>`:: + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000001 + Reserved1 : 0000 + Base Address : 0000001000000000 + Address Length : 0000000200000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + +:doc:`HMAT <../acpi/hmat>`:: + + Structure Type : 0001 [SLLBI] + Data Type : 00 [Latency] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Entry : 0080 + Entry : 0100 + + Structure Type : 0001 [SLLBI] + Data Type : 03 [Bandwidth] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Entry : 1200 + Entry : 0200 + +:doc:`SLIT <../acpi/slit>`:: + + Signature : "SLIT" [System Locality Information Table] + Localities : 0000000000000003 + Locality 0 : 10 20 + Locality 1 : FF 0A + +:doc:`DSDT <../acpi/dsdt>`:: + + Scope (_SB) + { + Device (S0D0) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + ... + Name (_UID, 0x07) // _UID: Unique ID + } + ... + } diff --git a/Documentation/driver-api/cxl/platform/example-configurations/one-dev-per-hb.rst b/Documentation/driver-api/cxl/platform/example-configurations/one-dev-per-hb.rst new file mode 100644 index 000000000000..aebda0eb3e17 --- /dev/null +++ b/Documentation/driver-api/cxl/platform/example-configurations/one-dev-per-hb.rst @@ -0,0 +1,136 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +One Device per Host Bridge +========================== + +This system has a single socket with two CXL host bridges. Each host bridge +has a single CXL memory expander with a 4GB of memory. + +Things to note: + +* Cross-Bridge interleave is not being used. +* The expanders are in two separate but adjascent memory regions. +* This CEDT/SRAT describes one node per device +* The expanders have the same performance and will be in the same memory tier. + +:doc:`CEDT <../acpi/cedt>`:: + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000007 + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010370400000 + Register length : 0000000000010000 + + Subtable Type : 00 [CXL Host Bridge Structure] + Reserved : 00 + Length : 0020 + Associated host bridge : 00000006 + Specification version : 00000001 + Reserved : 00000000 + Register base : 0000010380800000 + Register length : 0000000000010000 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000001000000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000007 + + Subtable Type : 01 [CXL Fixed Memory Window Structure] + Reserved : 00 + Length : 002C + Reserved : 00000000 + Window base address : 0000001100000000 + Window size : 0000000100000000 + Interleave Members (2^n) : 00 + Interleave Arithmetic : 00 + Reserved : 0000 + Granularity : 00000000 + Restrictions : 0006 + QtgId : 0001 + First Target : 00000006 + +:doc:`SRAT <../acpi/srat>`:: + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000001 + Reserved1 : 0000 + Base Address : 0000001000000000 + Address Length : 0000000100000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + + Subtable Type : 01 [Memory Affinity] + Length : 28 + Proximity Domain : 00000002 + Reserved1 : 0000 + Base Address : 0000001100000000 + Address Length : 0000000100000000 + Reserved2 : 00000000 + Flags (decoded below) : 0000000B + Enabled : 1 + Hot Pluggable : 1 + Non-Volatile : 0 + +:doc:`HMAT <../acpi/hmat>`:: + + Structure Type : 0001 [SLLBI] + Data Type : 00 [Latency] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Target Proximity Domain List : 00000002 + Entry : 0080 + Entry : 0100 + Entry : 0100 + + Structure Type : 0001 [SLLBI] + Data Type : 03 [Bandwidth] + Target Proximity Domain List : 00000000 + Target Proximity Domain List : 00000001 + Target Proximity Domain List : 00000002 + Entry : 1200 + Entry : 0200 + Entry : 0200 + +:doc:`SLIT <../acpi/slit>`:: + + Signature : "SLIT" [System Locality Information Table] + Localities : 0000000000000003 + Locality 0 : 10 20 20 + Locality 1 : FF 0A FF + Locality 2 : FF FF 0A + +:doc:`DSDT <../acpi/dsdt>`:: + + Scope (_SB) + { + Device (S0D0) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + ... + Name (_UID, 0x07) // _UID: Unique ID + } + ... + Device (S0D5) + { + Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID + ... + Name (_UID, 0x06) // _UID: Unique ID + } + } diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/theory-of-operation.rst index 5149ecdc53c7..40793dad3630 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/theory-of-operation.rst @@ -1,9 +1,9 @@ .. SPDX-License-Identifier: GPL-2.0 .. include:: <isonum.txt> -=================================== -Compute Express Link Memory Devices -=================================== +=============================================== +Compute Express Link Driver Theory of Operation +=============================================== A Compute Express Link Memory Device is a CXL component that implements the CXL.mem protocol. It contains some amount of volatile memory, persistent memory, @@ -14,8 +14,8 @@ that optionally define a device's contribution to an interleaved address range across multiple devices underneath a host-bridge or interleaved across host-bridges. -CXL Bus: Theory of Operation -============================ +The CXL Bus +=========== Similar to how a RAID driver takes disk objects and assembles them into a new logical device, the CXL subsystem is tasked to take PCIe and ACPI objects and assemble them into a CXL.mem decode topology. The need for runtime configuration @@ -328,6 +328,12 @@ CXL Memory Device .. kernel-doc:: drivers/cxl/mem.c :doc: cxl mem +.. kernel-doc:: drivers/cxl/cxlmem.h + :internal: + +.. kernel-doc:: drivers/cxl/core/memdev.c + :identifiers: + CXL Port -------- .. kernel-doc:: drivers/cxl/port.c @@ -341,6 +347,18 @@ CXL Core .. kernel-doc:: drivers/cxl/cxl.h :internal: +.. kernel-doc:: drivers/cxl/acpi.c + :identifiers: add_cxl_resources + +.. kernel-doc:: drivers/cxl/core/hdm.c + :doc: cxl core hdm + +.. kernel-doc:: drivers/cxl/core/hdm.c + :identifiers: + +.. kernel-doc:: drivers/cxl/core/cdat.c + :identifiers: + .. kernel-doc:: drivers/cxl/core/port.c :doc: cxl core @@ -356,12 +374,26 @@ CXL Core .. kernel-doc:: drivers/cxl/core/pmem.c :doc: cxl pmem +.. kernel-doc:: drivers/cxl/core/pmem.c + :identifiers: + .. kernel-doc:: drivers/cxl/core/regs.c :doc: cxl registers +.. kernel-doc:: drivers/cxl/core/regs.c + :identifiers: + .. kernel-doc:: drivers/cxl/core/mbox.c :doc: cxl mbox +.. kernel-doc:: drivers/cxl/core/mbox.c + :identifiers: + +.. kernel-doc:: drivers/cxl/core/features.c + :doc: cxl features + +See :c:func:`devm_cxl_setup_features` for API details. + CXL Regions ----------- .. kernel-doc:: drivers/cxl/core/region.c diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst index ecf139f73da4..d491e385d61a 100644 --- a/Documentation/driver-api/dmaengine/client.rst +++ b/Documentation/driver-api/dmaengine/client.rst @@ -80,6 +80,10 @@ The details of these operations are: - slave_sg: DMA a list of scatter gather buffers from/to a peripheral + - peripheral_dma_vec: DMA an array of scatter gather buffers from/to a + peripheral. Similar to slave_sg, but uses an array of dma_vec + structures instead of a scatterlist. + - dma_cyclic: Perform a cyclic DMA operation from/to a peripheral till the operation is explicitly stopped. @@ -102,6 +106,11 @@ The details of these operations are: unsigned int sg_len, enum dma_data_direction direction, unsigned long flags); + struct dma_async_tx_descriptor *dmaengine_prep_peripheral_dma_vec( + struct dma_chan *chan, const struct dma_vec *vecs, + size_t nents, enum dma_data_direction direction, + unsigned long flags); + struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic( struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, enum dma_data_direction direction); diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst index ceac2a300e32..1594598b3317 100644 --- a/Documentation/driver-api/dmaengine/provider.rst +++ b/Documentation/driver-api/dmaengine/provider.rst @@ -172,8 +172,8 @@ Currently, the types available are: - It's usually used for copying pixel data between host memory and memory-mapped GPU device memory, such as found on modern PCI video graphics cards. The most immediate example is the OpenGL API function - ``glReadPielx()``, which might require a verbatim copy of a huge framebuffer - from local device memory onto host memory. + ``glReadPixels()``, which might require a verbatim copy of a huge + framebuffer from local device memory onto host memory. - DMA_XOR @@ -217,10 +217,12 @@ Currently, the types available are: - DMA_ASYNC_TX - - Must not be set by the device, and will be set by the framework - if needed + - The device supports asynchronous memory-to-memory operations, + including memcpy, memset, xor, pq, xor_val, and pq_val. - - TODO: What is it about? + - This capability is automatically set by the DMA engine + framework and must not be configured manually by device + drivers. - DMA_SLAVE @@ -433,6 +435,12 @@ supported. - residue: Provides the residue bytes of the transfer for those that support residue. +- ``device_prep_peripheral_dma_vec`` + + - Similar to ``device_prep_slave_sg``, but it takes a pointer to a + array of ``dma_vec`` structures, which (in the long run) will replace + scatterlists. + - ``device_issue_pending`` - Takes the first transaction descriptor in the pending queue, @@ -544,6 +552,10 @@ dma_cookie_t - Not really relevant any more since the introduction of ``virt-dma`` that abstracts it away. +dma_vec + +- A small structure that contains a DMA address and length. + DMA_CTRL_ACK - If clear, the descriptor cannot be reused by provider until the diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index ea8d16600e16..e6855cd37e85 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -214,6 +214,27 @@ offset values are fractional with 3-digit decimal places and shell be divided with ``DPLL_PIN_PHASE_OFFSET_DIVIDER`` to get integer part and modulo divided to get fractional part. +Embedded SYNC +============= + +Device may provide ability to use Embedded SYNC feature. It allows +to embed additional SYNC signal into the base frequency of a pin - a one +special pulse of base frequency signal every time SYNC signal pulse +happens. The user can configure the frequency of Embedded SYNC. +The Embedded SYNC capability is always related to a given base frequency +and HW capabilities. The user is provided a range of Embedded SYNC +frequencies supported, depending on current base frequency configured for +the pin. + + ========================================= ================================= + ``DPLL_A_PIN_ESYNC_FREQUENCY`` current Embedded SYNC frequency + ``DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED`` nest available Embedded SYNC + frequency ranges + ``DPLL_A_PIN_FREQUENCY_MIN`` attr minimum value of frequency + ``DPLL_A_PIN_FREQUENCY_MAX`` attr maximum value of frequency + ``DPLL_A_PIN_ESYNC_PULSE`` pulse type of Embedded SYNC + ========================================= ================================= + Configuration commands group ============================ diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 18caebad7376..3d56f94ac2ee 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -391,13 +391,11 @@ PCI devm_pci_remap_cfgspace() : ioremap PCI configuration space devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource - pcim_enable_device() : after success, all PCI ops become managed + pcim_enable_device() : after success, the PCI device gets disabled automatically on driver detach pcim_iomap() : do iomap() on a single BAR pcim_iomap_regions() : do request_region() and iomap() on multiple BARs - pcim_iomap_regions_request_all() : do request_region() on all and iomap() on multiple BARs pcim_iomap_table() : array of mapped addresses indexed by BAR pcim_iounmap() : do iounmap() on a single BAR - pcim_iounmap_regions() : do iounmap() and release_region() on multiple BARs pcim_pin_device() : keep PCI device enabled after release pcim_set_mwi() : enable Memory-Write-Invalidate PCI transaction @@ -405,7 +403,6 @@ PHY devm_usb_get_phy() devm_usb_get_phy_by_node() devm_usb_get_phy_by_phandle() - devm_usb_put_phy() PINCTRL devm_pinctrl_get() @@ -459,12 +456,14 @@ SERDEV SLAVE DMA ENGINE devm_acpi_dma_controller_register() - devm_acpi_dma_controller_free() SPI - devm_spi_alloc_master() - devm_spi_alloc_slave() + devm_spi_alloc_host() + devm_spi_alloc_target() + devm_spi_optimize_message() devm_spi_register_controller() + devm_spi_register_host() + devm_spi_register_target() WATCHDOG devm_watchdog_register_device() diff --git a/Documentation/driver-api/driver-model/platform.rst b/Documentation/driver-api/driver-model/platform.rst index 1fe5c6c6199c..7beb8a9648c5 100644 --- a/Documentation/driver-api/driver-model/platform.rst +++ b/Documentation/driver-api/driver-model/platform.rst @@ -41,13 +41,14 @@ and shutdown notifications using the standard conventions:: struct platform_driver { int (*probe)(struct platform_device *); - int (*remove)(struct platform_device *); + void (*remove)(struct platform_device *); void (*shutdown)(struct platform_device *); int (*suspend)(struct platform_device *, pm_message_t state); - int (*suspend_late)(struct platform_device *, pm_message_t state); - int (*resume_early)(struct platform_device *); int (*resume)(struct platform_device *); struct device_driver driver; + const struct platform_device_id *id_table; + bool prevent_deferred_probe; + bool driver_managed_dma; }; Note that probe() should in general verify that the specified device hardware diff --git a/Documentation/driver-api/early-userspace/buffer-format.rst b/Documentation/driver-api/early-userspace/buffer-format.rst index 7f74e301fdf3..726bfa2fe70d 100644 --- a/Documentation/driver-api/early-userspace/buffer-format.rst +++ b/Documentation/driver-api/early-userspace/buffer-format.rst @@ -4,20 +4,18 @@ initramfs buffer format Al Viro, H. Peter Anvin -Last revision: 2002-01-13 - -Starting with kernel 2.5.x, the old "initial ramdisk" protocol is -getting {replaced/complemented} with the new "initial ramfs" -(initramfs) protocol. The initramfs contents is passed using the same -memory buffer protocol used by the initrd protocol, but the contents +With kernel 2.5.x, the old "initial ramdisk" protocol was complemented +with an "initial ramfs" protocol. The initramfs content is passed +using the same memory buffer protocol used by initrd, but the content is different. The initramfs buffer contains an archive which is -expanded into a ramfs filesystem; this document details the format of -the initramfs buffer format. +expanded into a ramfs filesystem; this document details the initramfs +buffer format. The initramfs buffer format is based around the "newc" or "crc" CPIO formats, and can be created with the cpio(1) utility. The cpio -archive can be compressed using gzip(1). One valid version of an -initramfs buffer is thus a single .cpio.gz file. +archive can be compressed using gzip(1), or any other algorithm provided +via CONFIG_DECOMPRESS_*. One valid version of an initramfs buffer is +thus a single .cpio.gz file. The full format of the initramfs buffer is defined by the following grammar, where:: @@ -25,12 +23,20 @@ grammar, where:: * is used to indicate "0 or more occurrences of" (|) indicates alternatives + indicates concatenation - GZIP() indicates the gzip(1) of the operand + GZIP() indicates gzip compression of the operand + BZIP2() indicates bzip2 compression of the operand + LZMA() indicates lzma compression of the operand + XZ() indicates xz compression of the operand + LZO() indicates lzo compression of the operand + LZ4() indicates lz4 compression of the operand + ZSTD() indicates zstd compression of the operand ALGN(n) means padding with null bytes to an n-byte boundary - initramfs := ("\0" | cpio_archive | cpio_gzip_archive)* + initramfs := ("\0" | cpio_archive | cpio_compressed_archive)* - cpio_gzip_archive := GZIP(cpio_archive) + cpio_compressed_archive := (GZIP(cpio_archive) | BZIP2(cpio_archive) + | LZMA(cpio_archive) | XZ(cpio_archive) | LZO(cpio_archive) + | LZ4(cpio_archive) | ZSTD(cpio_archive)) cpio_archive := cpio_file* + (<nothing> | cpio_trailer) @@ -75,6 +81,8 @@ c_chksum 8 bytes Checksum of data field if c_magic is 070702; The c_mode field matches the contents of st_mode returned by stat(2) on Linux, and encodes the file type and file permissions. +c_mtime is ignored unless CONFIG_INITRAMFS_PRESERVE_MTIME=y is set. + The c_filesize should be zero for any file which is not a regular file or symlink. diff --git a/Documentation/driver-api/extcon.rst b/Documentation/driver-api/extcon.rst new file mode 100644 index 000000000000..d3217b9cdcd5 --- /dev/null +++ b/Documentation/driver-api/extcon.rst @@ -0,0 +1,255 @@ +======================= +Extcon Device Subsystem +======================= + +Overview +======== + +The Extcon (External Connector) subsystem provides a unified framework for +managing external connectors in Linux systems. It allows drivers to report +the state of external connectors and provides a standardized interface for +userspace to query and monitor these states. + +Extcon is particularly useful in modern devices with multiple connectivity +options, such as smartphones, tablets, and laptops. It helps manage various +types of connectors, including: + +1. USB connectors (e.g., USB-C, micro-USB) +2. Charging ports (e.g., fast charging, wireless charging) +3. Audio jacks (e.g., 3.5mm headphone jack) +4. Video outputs (e.g., HDMI, DisplayPort) +5. Docking stations + +Real-world examples: + +1. Smartphone USB-C port: + A single USB-C port on a smartphone can serve multiple functions. Extcon + can manage the different states of this port, such as: + - USB data connection + - Charging (various types like fast charging, USB Power Delivery) + - Audio output (USB-C headphones) + - Video output (USB-C to HDMI adapter) + +2. Laptop docking station: + When a laptop is connected to a docking station, multiple connections are + made simultaneously. Extcon can handle the state changes for: + - Power delivery + - External displays + - USB hub connections + - Ethernet connectivity + +3. Wireless charging pad: + Extcon can manage the state of a wireless charging connection, allowing + the system to respond appropriately when a device is placed on or removed + from the charging pad. + +4. Smart TV HDMI ports: + In a smart TV, Extcon can manage multiple HDMI ports, detecting when + devices are connected or disconnected, and potentially identifying the + type of device (e.g., gaming console, set-top box, Blu-ray player). + +The Extcon framework simplifies the development of drivers for these complex +scenarios by providing a standardized way to report and query connector +states, handle mutually exclusive connections, and manage connector +properties. This allows for more robust and flexible handling of external +connections in modern devices. + +Key Components +============== + +extcon_dev +---------- + +The core structure representing an Extcon device:: + + struct extcon_dev { + const char *name; + const unsigned int *supported_cable; + const u32 *mutually_exclusive; + + /* Internal data */ + struct device dev; + unsigned int id; + struct raw_notifier_head nh_all; + struct raw_notifier_head *nh; + struct list_head entry; + int max_supported; + spinlock_t lock; + u32 state; + + /* Sysfs related */ + struct device_type extcon_dev_type; + struct extcon_cable *cables; + struct attribute_group attr_g_muex; + struct attribute **attrs_muex; + struct device_attribute *d_attrs_muex; + }; + +Key fields: + +- ``name``: Name of the Extcon device +- ``supported_cable``: Array of supported cable types +- ``mutually_exclusive``: Array defining mutually exclusive cable types + This field is crucial for enforcing hardware constraints. It's an array of + 32-bit unsigned integers, where each element represents a set of mutually + exclusive cable types. The array should be terminated with a 0. + + For example: + + :: + + static const u32 mutually_exclusive[] = { + BIT(0) | BIT(1), /* Cable 0 and 1 are mutually exclusive */ + BIT(2) | BIT(3) | BIT(4), /* Cables 2, 3, and 4 are mutually exclusive */ + 0 /* Terminator */ + }; + + In this example, cables 0 and 1 cannot be connected simultaneously, and + cables 2, 3, and 4 are also mutually exclusive. This is useful for + scenarios like a single port that can either be USB or HDMI, but not both + at the same time. + + The Extcon core uses this information to prevent invalid combinations of + cable states, ensuring that the reported states are always consistent + with the hardware capabilities. + +- ``state``: Current state of the device (bitmap of connected cables) + + +extcon_cable +------------ + +Represents an individual cable managed by an Extcon device:: + + struct extcon_cable { + struct extcon_dev *edev; + int cable_index; + struct attribute_group attr_g; + struct device_attribute attr_name; + struct device_attribute attr_state; + struct attribute *attrs[3]; + union extcon_property_value usb_propval[EXTCON_PROP_USB_CNT]; + union extcon_property_value chg_propval[EXTCON_PROP_CHG_CNT]; + union extcon_property_value jack_propval[EXTCON_PROP_JACK_CNT]; + union extcon_property_value disp_propval[EXTCON_PROP_DISP_CNT]; + DECLARE_BITMAP(usb_bits, EXTCON_PROP_USB_CNT); + DECLARE_BITMAP(chg_bits, EXTCON_PROP_CHG_CNT); + DECLARE_BITMAP(jack_bits, EXTCON_PROP_JACK_CNT); + DECLARE_BITMAP(disp_bits, EXTCON_PROP_DISP_CNT); + }; + +Core Functions +============== + +.. kernel-doc:: drivers/extcon/extcon.c + :identifiers: extcon_get_state + +.. kernel-doc:: drivers/extcon/extcon.c + :identifiers: extcon_set_state + +.. kernel-doc:: drivers/extcon/extcon.c + :identifiers: extcon_set_state_sync + +.. kernel-doc:: drivers/extcon/extcon.c + :identifiers: extcon_get_property + + +Sysfs Interface +=============== + +Extcon devices expose the following sysfs attributes: + +- ``name``: Name of the Extcon device +- ``state``: Current state of all supported cables +- ``cable.N/name``: Name of the Nth supported cable +- ``cable.N/state``: State of the Nth supported cable + +Usage Example +------------- + +.. code-block:: c + + #include <linux/module.h> + #include <linux/platform_device.h> + #include <linux/extcon.h> + + struct my_extcon_data { + struct extcon_dev *edev; + struct device *dev; + }; + + static const unsigned int my_extcon_cable[] = { + EXTCON_USB, + EXTCON_USB_HOST, + EXTCON_NONE, + }; + + static int my_extcon_probe(struct platform_device *pdev) + { + struct my_extcon_data *data; + int ret; + + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->dev = &pdev->dev; + + /* Initialize extcon device */ + data->edev = devm_extcon_dev_allocate(data->dev, my_extcon_cable); + if (IS_ERR(data->edev)) { + dev_err(data->dev, "Failed to allocate extcon device\n"); + return PTR_ERR(data->edev); + } + + /* Register extcon device */ + ret = devm_extcon_dev_register(data->dev, data->edev); + if (ret < 0) { + dev_err(data->dev, "Failed to register extcon device\n"); + return ret; + } + + platform_set_drvdata(pdev, data); + + /* Example: Set initial state */ + extcon_set_state_sync(data->edev, EXTCON_USB, true); + + dev_info(data->dev, "My extcon driver probed successfully\n"); + return 0; + } + + static int my_extcon_remove(struct platform_device *pdev) + { + struct my_extcon_data *data = platform_get_drvdata(pdev); + + /* Example: Clear state before removal */ + extcon_set_state_sync(data->edev, EXTCON_USB, false); + + dev_info(data->dev, "My extcon driver removed\n"); + return 0; + } + + static const struct of_device_id my_extcon_of_match[] = { + { .compatible = "my,extcon-device", }, + { }, + }; + MODULE_DEVICE_TABLE(of, my_extcon_of_match); + + static struct platform_driver my_extcon_driver = { + .driver = { + .name = "my-extcon-driver", + .of_match_table = my_extcon_of_match, + }, + .probe = my_extcon_probe, + .remove = my_extcon_remove, + }; + + module_platform_driver(my_extcon_driver); + +This example demonstrates: +--------------------------- + +- Defining supported cable types (USB and USB Host in this case). +- Allocating and registering an extcon device. +- Setting an initial state for a cable (USB connected in this example). +- Clearing the state when the driver is removed. diff --git a/Documentation/driver-api/firewire.rst b/Documentation/driver-api/firewire.rst index d3cfa73cbb2b..28a32410f7d2 100644 --- a/Documentation/driver-api/firewire.rst +++ b/Documentation/driver-api/firewire.rst @@ -43,6 +43,8 @@ Firewire core transaction interfaces Firewire Isochronous I/O interfaces =================================== +.. kernel-doc:: include/linux/firewire.h + :functions: fw_iso_context_schedule_flush_completions .. kernel-doc:: drivers/firewire/core-iso.c :export: diff --git a/Documentation/driver-api/firmware/firmware-usage-guidelines.rst b/Documentation/driver-api/firmware/firmware-usage-guidelines.rst index fdcfce42c6d2..336e912281c3 100644 --- a/Documentation/driver-api/firmware/firmware-usage-guidelines.rst +++ b/Documentation/driver-api/firmware/firmware-usage-guidelines.rst @@ -42,3 +42,8 @@ then of course these rules will not apply strictly.) deprecating old major versions, then this should only be done as a last option, and be stated clearly in all communications. +* Firmware files that affect the User API (UAPI) shall not introduce + changes that break existing userspace programs. Updates to such firmware + must ensure backward compatibility with existing userspace applications. + This includes maintaining consistent interfaces and behaviors that + userspace programs rely on. diff --git a/Documentation/driver-api/fpga/fpga-bridge.rst b/Documentation/driver-api/fpga/fpga-bridge.rst index 604208534095..833f68fb0700 100644 --- a/Documentation/driver-api/fpga/fpga-bridge.rst +++ b/Documentation/driver-api/fpga/fpga-bridge.rst @@ -6,9 +6,12 @@ API to implement a new FPGA bridge * struct fpga_bridge - The FPGA Bridge structure * struct fpga_bridge_ops - Low level Bridge driver ops -* fpga_bridge_register() - Create and register a bridge +* __fpga_bridge_register() - Create and register a bridge * fpga_bridge_unregister() - Unregister a bridge +The helper macro ``fpga_bridge_register()`` automatically sets +the module that registers the FPGA bridge as the owner. + .. kernel-doc:: include/linux/fpga/fpga-bridge.h :functions: fpga_bridge @@ -16,7 +19,7 @@ API to implement a new FPGA bridge :functions: fpga_bridge_ops .. kernel-doc:: drivers/fpga/fpga-bridge.c - :functions: fpga_bridge_register + :functions: __fpga_bridge_register .. kernel-doc:: drivers/fpga/fpga-bridge.c :functions: fpga_bridge_unregister diff --git a/Documentation/driver-api/fpga/fpga-mgr.rst b/Documentation/driver-api/fpga/fpga-mgr.rst index 49c0a9512653..8d2b79f696c1 100644 --- a/Documentation/driver-api/fpga/fpga-mgr.rst +++ b/Documentation/driver-api/fpga/fpga-mgr.rst @@ -24,7 +24,8 @@ How to support a new FPGA device -------------------------------- To add another FPGA manager, write a driver that implements a set of ops. The -probe function calls fpga_mgr_register() or fpga_mgr_register_full(), such as:: +probe function calls ``fpga_mgr_register()`` or ``fpga_mgr_register_full()``, +such as:: static const struct fpga_manager_ops socfpga_fpga_ops = { .write_init = socfpga_fpga_ops_configure_init, @@ -69,10 +70,11 @@ probe function calls fpga_mgr_register() or fpga_mgr_register_full(), such as:: } Alternatively, the probe function could call one of the resource managed -register functions, devm_fpga_mgr_register() or devm_fpga_mgr_register_full(). -When these functions are used, the parameter syntax is the same, but the call -to fpga_mgr_unregister() should be removed. In the above example, the -socfpga_fpga_remove() function would not be required. +register functions, ``devm_fpga_mgr_register()`` or +``devm_fpga_mgr_register_full()``. When these functions are used, the +parameter syntax is the same, but the call to ``fpga_mgr_unregister()`` should be +removed. In the above example, the ``socfpga_fpga_remove()`` function would not be +required. The ops will implement whatever device specific register writes are needed to do the programming sequence for this particular FPGA. These ops return 0 for @@ -125,15 +127,19 @@ API for implementing a new FPGA Manager driver * struct fpga_manager - the FPGA manager struct * struct fpga_manager_ops - Low level FPGA manager driver ops * struct fpga_manager_info - Parameter structure for fpga_mgr_register_full() -* fpga_mgr_register_full() - Create and register an FPGA manager using the +* __fpga_mgr_register_full() - Create and register an FPGA manager using the fpga_mgr_info structure to provide the full flexibility of options -* fpga_mgr_register() - Create and register an FPGA manager using standard +* __fpga_mgr_register() - Create and register an FPGA manager using standard arguments -* devm_fpga_mgr_register_full() - Resource managed version of - fpga_mgr_register_full() -* devm_fpga_mgr_register() - Resource managed version of fpga_mgr_register() +* __devm_fpga_mgr_register_full() - Resource managed version of + __fpga_mgr_register_full() +* __devm_fpga_mgr_register() - Resource managed version of __fpga_mgr_register() * fpga_mgr_unregister() - Unregister an FPGA manager +Helper macros ``fpga_mgr_register_full()``, ``fpga_mgr_register()``, +``devm_fpga_mgr_register_full()``, and ``devm_fpga_mgr_register()`` are available +to ease the registration. + .. kernel-doc:: include/linux/fpga/fpga-mgr.h :functions: fpga_mgr_states @@ -147,16 +153,16 @@ API for implementing a new FPGA Manager driver :functions: fpga_manager_info .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_register_full + :functions: __fpga_mgr_register_full .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: fpga_mgr_register + :functions: __fpga_mgr_register .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: devm_fpga_mgr_register_full + :functions: __devm_fpga_mgr_register_full .. kernel-doc:: drivers/fpga/fpga-mgr.c - :functions: devm_fpga_mgr_register + :functions: __devm_fpga_mgr_register .. kernel-doc:: drivers/fpga/fpga-mgr.c :functions: fpga_mgr_unregister diff --git a/Documentation/driver-api/fpga/fpga-region.rst b/Documentation/driver-api/fpga/fpga-region.rst index dc55d60a0b4a..2d03b5fb7657 100644 --- a/Documentation/driver-api/fpga/fpga-region.rst +++ b/Documentation/driver-api/fpga/fpga-region.rst @@ -46,13 +46,16 @@ API to add a new FPGA region ---------------------------- * struct fpga_region - The FPGA region struct -* struct fpga_region_info - Parameter structure for fpga_region_register_full() -* fpga_region_register_full() - Create and register an FPGA region using the +* struct fpga_region_info - Parameter structure for __fpga_region_register_full() +* __fpga_region_register_full() - Create and register an FPGA region using the fpga_region_info structure to provide the full flexibility of options -* fpga_region_register() - Create and register an FPGA region using standard +* __fpga_region_register() - Create and register an FPGA region using standard arguments * fpga_region_unregister() - Unregister an FPGA region +Helper macros ``fpga_region_register()`` and ``fpga_region_register_full()`` +automatically set the module that registers the FPGA region as the owner. + The FPGA region's probe function will need to get a reference to the FPGA Manager it will be using to do the programming. This usually would happen during the region's probe function. @@ -82,10 +85,10 @@ following APIs to handle building or tearing down that list. :functions: fpga_region_info .. kernel-doc:: drivers/fpga/fpga-region.c - :functions: fpga_region_register_full + :functions: __fpga_region_register_full .. kernel-doc:: drivers/fpga/fpga-region.c - :functions: fpga_region_register + :functions: __fpga_region_register .. kernel-doc:: drivers/fpga/fpga-region.c :functions: fpga_region_unregister diff --git a/Documentation/driver-api/generic-counter.rst b/Documentation/driver-api/generic-counter.rst index 71ccc30e586b..e826f16ea43d 100644 --- a/Documentation/driver-api/generic-counter.rst +++ b/Documentation/driver-api/generic-counter.rst @@ -467,7 +467,7 @@ Counter sysfs Translates counter data to the standard Counter sysfs interface format and vice versa. -Please refer to the ``Documentation/ABI/testing/sysfs-bus-counter`` file +Please refer to the Documentation/ABI/testing/sysfs-bus-counter file for a detailed breakdown of the available Generic Counter interface sysfs attributes. @@ -483,7 +483,7 @@ Sysfs Interface Several sysfs attributes are generated by the Generic Counter interface, and reside under the ``/sys/bus/counter/devices/counterX`` directory, where ``X`` is to the respective counter device id. Please see -``Documentation/ABI/testing/sysfs-bus-counter`` for detailed information +Documentation/ABI/testing/sysfs-bus-counter for detailed information on each Generic Counter interface sysfs attribute. Through these sysfs attributes, programs and scripts may interact with diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index b33aa04f213f..4fd1cbd8296e 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -4,12 +4,6 @@ GPIO Mappings This document explains how GPIOs can be assigned to given devices and functions. -Note that it only applies to the new descriptor-based interface. For a -description of the deprecated integer-based GPIO interface please refer to -legacy.rst (actually, there is no real mapping possible with the old -interface; you just fetch an integer from somewhere and request the -corresponding GPIO). - All platforms can enable the GPIO library, but if the platform strictly requires GPIO functionality to be present, it needs to select GPIOLIB from its Kconfig. Then, how GPIOs are mapped depends on what the platform uses to diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst index ab56ab0dd7a6..bb3366047fad 100644 --- a/Documentation/driver-api/gpio/consumer.rst +++ b/Documentation/driver-api/gpio/consumer.rst @@ -2,9 +2,7 @@ GPIO Descriptor Consumer Interface ================================== -This document describes the consumer interface of the GPIO framework. Note that -it describes the new descriptor-based interface. For a description of the -deprecated integer-based GPIO interface please refer to legacy.rst. +This document describes the consumer interface of the GPIO framework. Guidelines for GPIOs consumers diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index e541bd2e898b..ae433261e11a 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -69,9 +69,8 @@ driver code: The code implementing a gpio_chip should support multiple instances of the controller, preferably using the driver model. That code will configure each -gpio_chip and issue gpiochip_add(), gpiochip_add_data(), or -devm_gpiochip_add_data(). Removing a GPIO controller should be rare; use -gpiochip_remove() when it is unavoidable. +gpio_chip and issue gpiochip_add_data() or devm_gpiochip_add_data(). Removing +a GPIO controller should be rare; use gpiochip_remove() when it is unavoidable. Often a gpio_chip is part of an instance-specific structure with states not exposed by the GPIO interfaces, such as addressing, power management, and more. diff --git a/Documentation/driver-api/gpio/drivers-on-gpio.rst b/Documentation/driver-api/gpio/drivers-on-gpio.rst index af632d764ac6..95572d2a94ce 100644 --- a/Documentation/driver-api/gpio/drivers-on-gpio.rst +++ b/Documentation/driver-api/gpio/drivers-on-gpio.rst @@ -27,7 +27,12 @@ hardware descriptions such as device tree or ACPI: to the lines for a more permanent solution of this type. - gpio-beeper: drivers/input/misc/gpio-beeper.c is used to provide a beep from - an external speaker connected to a GPIO line. + an external speaker connected to a GPIO line. (If the beep is controlled by + off/on, for an actual PWM waveform, see pwm-gpio below.) + +- pwm-gpio: drivers/pwm/pwm-gpio.c is used to toggle a GPIO with a high + resolution timer producing a PWM waveform on the GPIO line, as well as + Linux high resolution timers can do. - extcon-gpio: drivers/extcon/extcon-gpio.c is used when you need to read an external connector status, such as a headset line for an audio driver or an diff --git a/Documentation/driver-api/gpio/index.rst b/Documentation/driver-api/gpio/index.rst index 1d48fe248f05..43f6a3afe10b 100644 --- a/Documentation/driver-api/gpio/index.rst +++ b/Documentation/driver-api/gpio/index.rst @@ -13,7 +13,6 @@ Contents: consumer board drivers-on-gpio - legacy bt8xxgpio Core @@ -28,7 +27,7 @@ Core ACPI support ============ -.. kernel-doc:: drivers/gpio/gpiolib-acpi.c +.. kernel-doc:: drivers/gpio/gpiolib-acpi-core.c :export: Device tree support diff --git a/Documentation/driver-api/gpio/intro.rst b/Documentation/driver-api/gpio/intro.rst index c9c19243b97f..5936a9c57df3 100644 --- a/Documentation/driver-api/gpio/intro.rst +++ b/Documentation/driver-api/gpio/intro.rst @@ -10,18 +10,6 @@ The documents in this directory give detailed instructions on how to access GPIOs in drivers, and how to write a driver for a device that provides GPIOs itself. -Due to the history of GPIO interfaces in the kernel, there are two different -ways to obtain and use GPIOs: - - - The descriptor-based interface is the preferred way to manipulate GPIOs, - and is described by all the files in this directory excepted legacy.rst. - - The legacy integer-based interface which is considered deprecated (but still - usable for compatibility reasons) is documented in legacy.rst. - -The remainder of this document applies to the new descriptor-based interface. -legacy.rst contains the same information applied to the legacy -integer-based interface. - What is a GPIO? =============== diff --git a/Documentation/driver-api/gpio/legacy.rst b/Documentation/driver-api/gpio/legacy.rst deleted file mode 100644 index 534dfe95d128..000000000000 --- a/Documentation/driver-api/gpio/legacy.rst +++ /dev/null @@ -1,679 +0,0 @@ -====================== -Legacy GPIO Interfaces -====================== - -This provides an overview of GPIO access conventions on Linux. - -These calls use the gpio_* naming prefix. No other calls should use that -prefix, or the related __gpio_* prefix. - - -What is a GPIO? -=============== -A "General Purpose Input/Output" (GPIO) is a flexible software-controlled -digital signal. They are provided from many kinds of chip, and are familiar -to Linux developers working with embedded and custom hardware. Each GPIO -represents a bit connected to a particular pin, or "ball" on Ball Grid Array -(BGA) packages. Board schematics show which external hardware connects to -which GPIOs. Drivers can be written generically, so that board setup code -passes such pin configuration data to drivers. - -System-on-Chip (SOC) processors heavily rely on GPIOs. In some cases, every -non-dedicated pin can be configured as a GPIO; and most chips have at least -several dozen of them. Programmable logic devices (like FPGAs) can easily -provide GPIOs; multifunction chips like power managers, and audio codecs -often have a few such pins to help with pin scarcity on SOCs; and there are -also "GPIO Expander" chips that connect using the I2C or SPI serial busses. -Most PC southbridges have a few dozen GPIO-capable pins (with only the BIOS -firmware knowing how they're used). - -The exact capabilities of GPIOs vary between systems. Common options: - - - Output values are writable (high=1, low=0). Some chips also have - options about how that value is driven, so that for example only one - value might be driven ... supporting "wire-OR" and similar schemes - for the other value (notably, "open drain" signaling). - - - Input values are likewise readable (1, 0). Some chips support readback - of pins configured as "output", which is very useful in such "wire-OR" - cases (to support bidirectional signaling). GPIO controllers may have - input de-glitch/debounce logic, sometimes with software controls. - - - Inputs can often be used as IRQ signals, often edge triggered but - sometimes level triggered. Such IRQs may be configurable as system - wakeup events, to wake the system from a low power state. - - - Usually a GPIO will be configurable as either input or output, as needed - by different product boards; single direction ones exist too. - - - Most GPIOs can be accessed while holding spinlocks, but those accessed - through a serial bus normally can't. Some systems support both types. - -On a given board each GPIO is used for one specific purpose like monitoring -MMC/SD card insertion/removal, detecting card writeprotect status, driving -a LED, configuring a transceiver, bitbanging a serial bus, poking a hardware -watchdog, sensing a switch, and so on. - - -GPIO conventions -================ -Note that this is called a "convention" because you don't need to do it this -way, and it's no crime if you don't. There **are** cases where portability -is not the main issue; GPIOs are often used for the kind of board-specific -glue logic that may even change between board revisions, and can't ever be -used on a board that's wired differently. Only least-common-denominator -functionality can be very portable. Other features are platform-specific, -and that can be critical for glue logic. - -Plus, this doesn't require any implementation framework, just an interface. -One platform might implement it as simple inline functions accessing chip -registers; another might implement it by delegating through abstractions -used for several very different kinds of GPIO controller. (There is some -optional code supporting such an implementation strategy, described later -in this document, but drivers acting as clients to the GPIO interface must -not care how it's implemented.) - -That said, if the convention is supported on their platform, drivers should -use it when possible. Platforms must select GPIOLIB if GPIO functionality -is strictly required. Drivers that can't work without -standard GPIO calls should have Kconfig entries which depend on GPIOLIB. The -GPIO calls are available, either as "real code" or as optimized-away stubs, -when drivers use the include file: - - #include <linux/gpio.h> - -If you stick to this convention then it'll be easier for other developers to -see what your code is doing, and help maintain it. - -Note that these operations include I/O barriers on platforms which need to -use them; drivers don't need to add them explicitly. - - -Identifying GPIOs ------------------ -GPIOs are identified by unsigned integers in the range 0..MAX_INT. That -reserves "negative" numbers for other purposes like marking signals as -"not available on this board", or indicating faults. Code that doesn't -touch the underlying hardware treats these integers as opaque cookies. - -Platforms define how they use those integers, and usually #define symbols -for the GPIO lines so that board-specific setup code directly corresponds -to the relevant schematics. In contrast, drivers should only use GPIO -numbers passed to them from that setup code, using platform_data to hold -board-specific pin configuration data (along with other board specific -data they need). That avoids portability problems. - -So for example one platform uses numbers 32-159 for GPIOs; while another -uses numbers 0..63 with one set of GPIO controllers, 64-79 with another -type of GPIO controller, and on one particular board 80-95 with an FPGA. -The numbers need not be contiguous; either of those platforms could also -use numbers 2000-2063 to identify GPIOs in a bank of I2C GPIO expanders. - -If you want to initialize a structure with an invalid GPIO number, use -some negative number (perhaps "-EINVAL"); that will never be valid. To -test if such number from such a structure could reference a GPIO, you -may use this predicate: - - int gpio_is_valid(int number); - -A number that's not valid will be rejected by calls which may request -or free GPIOs (see below). Other numbers may also be rejected; for -example, a number might be valid but temporarily unused on a given board. - -Whether a platform supports multiple GPIO controllers is a platform-specific -implementation issue, as are whether that support can leave "holes" in the space -of GPIO numbers, and whether new controllers can be added at runtime. Such issues -can affect things including whether adjacent GPIO numbers are both valid. - -Using GPIOs ------------ -The first thing a system should do with a GPIO is allocate it, using -the gpio_request() call; see later. - -One of the next things to do with a GPIO, often in board setup code when -setting up a platform_device using the GPIO, is mark its direction:: - - /* set as input or output, returning 0 or negative errno */ - int gpio_direction_input(unsigned gpio); - int gpio_direction_output(unsigned gpio, int value); - -The return value is zero for success, else a negative errno. It should -be checked, since the get/set calls don't have error returns and since -misconfiguration is possible. You should normally issue these calls from -a task context. However, for spinlock-safe GPIOs it's OK to use them -before tasking is enabled, as part of early board setup. - -For output GPIOs, the value provided becomes the initial output value. -This helps avoid signal glitching during system startup. - -For compatibility with legacy interfaces to GPIOs, setting the direction -of a GPIO implicitly requests that GPIO (see below) if it has not been -requested already. That compatibility is being removed from the optional -gpiolib framework. - -Setting the direction can fail if the GPIO number is invalid, or when -that particular GPIO can't be used in that mode. It's generally a bad -idea to rely on boot firmware to have set the direction correctly, since -it probably wasn't validated to do more than boot Linux. (Similarly, -that board setup code probably needs to multiplex that pin as a GPIO, -and configure pullups/pulldowns appropriately.) - - -Spinlock-Safe GPIO access -------------------------- -Most GPIO controllers can be accessed with memory read/write instructions. -Those don't need to sleep, and can safely be done from inside hard -(nonthreaded) IRQ handlers and similar contexts. - -Use the following calls to access such GPIOs:: - - /* GPIO INPUT: return zero or nonzero */ - int gpio_get_value(unsigned gpio); - - /* GPIO OUTPUT */ - void gpio_set_value(unsigned gpio, int value); - -The values are boolean, zero for low, nonzero for high. When reading the -value of an output pin, the value returned should be what's seen on the -pin ... that won't always match the specified output value, because of -issues including open-drain signaling and output latencies. - -The get/set calls have no error returns because "invalid GPIO" should have -been reported earlier from gpio_direction_*(). However, note that not all -platforms can read the value of output pins; those that can't should always -return zero. Also, using these calls for GPIOs that can't safely be accessed -without sleeping (see below) is an error. - -Platform-specific implementations are encouraged to optimize the two -calls to access the GPIO value in cases where the GPIO number (and for -output, value) are constant. It's normal for them to need only a couple -of instructions in such cases (reading or writing a hardware register), -and not to need spinlocks. Such optimized calls can make bitbanging -applications a lot more efficient (in both space and time) than spending -dozens of instructions on subroutine calls. - - -GPIO access that may sleep --------------------------- -Some GPIO controllers must be accessed using message based busses like I2C -or SPI. Commands to read or write those GPIO values require waiting to -get to the head of a queue to transmit a command and get its response. -This requires sleeping, which can't be done from inside IRQ handlers. -To access such GPIOs, a different set of accessors is defined:: - - /* GPIO INPUT: return zero or nonzero, might sleep */ - int gpio_get_value_cansleep(unsigned gpio); - - /* GPIO OUTPUT, might sleep */ - void gpio_set_value_cansleep(unsigned gpio, int value); - -Accessing such GPIOs requires a context which may sleep, for example -a threaded IRQ handler, and those accessors must be used instead of -spinlock-safe accessors without the cansleep() name suffix. - -Other than the fact that these accessors might sleep, and will work -on GPIOs that can't be accessed from hardIRQ handlers, these calls act -the same as the spinlock-safe calls. - -**IN ADDITION** calls to setup and configure such GPIOs must be made -from contexts which may sleep, since they may need to access the GPIO -controller chip too (These setup calls are usually made from board -setup or driver probe/teardown code, so this is an easy constraint.):: - - gpio_direction_input() - gpio_direction_output() - gpio_request() - - ## gpio_request_one() - - gpio_free() - - -Claiming and Releasing GPIOs ----------------------------- -To help catch system configuration errors, two calls are defined:: - - /* request GPIO, returning 0 or negative errno. - * non-null labels may be useful for diagnostics. - */ - int gpio_request(unsigned gpio, const char *label); - - /* release previously-claimed GPIO */ - void gpio_free(unsigned gpio); - -Passing invalid GPIO numbers to gpio_request() will fail, as will requesting -GPIOs that have already been claimed with that call. The return value of -gpio_request() must be checked. You should normally issue these calls from -a task context. However, for spinlock-safe GPIOs it's OK to request GPIOs -before tasking is enabled, as part of early board setup. - -These calls serve two basic purposes. One is marking the signals which -are actually in use as GPIOs, for better diagnostics; systems may have -several hundred potential GPIOs, but often only a dozen are used on any -given board. Another is to catch conflicts, identifying errors when -(a) two or more drivers wrongly think they have exclusive use of that -signal, or (b) something wrongly believes it's safe to remove drivers -needed to manage a signal that's in active use. That is, requesting a -GPIO can serve as a kind of lock. - -Some platforms may also use knowledge about what GPIOs are active for -power management, such as by powering down unused chip sectors and, more -easily, gating off unused clocks. - -For GPIOs that use pins known to the pinctrl subsystem, that subsystem should -be informed of their use; a gpiolib driver's .request() operation may call -pinctrl_gpio_request(), and a gpiolib driver's .free() operation may call -pinctrl_gpio_free(). The pinctrl subsystem allows a pinctrl_gpio_request() -to succeed concurrently with a pin or pingroup being "owned" by a device for -pin multiplexing. - -Any programming of pin multiplexing hardware that is needed to route the -GPIO signal to the appropriate pin should occur within a GPIO driver's -.direction_input() or .direction_output() operations, and occur after any -setup of an output GPIO's value. This allows a glitch-free migration from a -pin's special function to GPIO. This is sometimes required when using a GPIO -to implement a workaround on signals typically driven by a non-GPIO HW block. - -Some platforms allow some or all GPIO signals to be routed to different pins. -Similarly, other aspects of the GPIO or pin may need to be configured, such as -pullup/pulldown. Platform software should arrange that any such details are -configured prior to gpio_request() being called for those GPIOs, e.g. using -the pinctrl subsystem's mapping table, so that GPIO users need not be aware -of these details. - -Also note that it's your responsibility to have stopped using a GPIO -before you free it. - -Considering in most cases GPIOs are actually configured right after they -are claimed, three additional calls are defined:: - - /* request a single GPIO, with initial configuration specified by - * 'flags', identical to gpio_request() wrt other arguments and - * return value - */ - int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); - -where 'flags' is currently defined to specify the following properties: - - * GPIOF_DIR_IN - to configure direction as input - * GPIOF_DIR_OUT - to configure direction as output - - * GPIOF_INIT_LOW - as output, set initial level to LOW - * GPIOF_INIT_HIGH - as output, set initial level to HIGH - -since GPIOF_INIT_* are only valid when configured as output, so group valid -combinations as: - - * GPIOF_IN - configure as input - * GPIOF_OUT_INIT_LOW - configured as output, initial level LOW - * GPIOF_OUT_INIT_HIGH - configured as output, initial level HIGH - -Further more, to ease the claim/release of multiple GPIOs, 'struct gpio' is -introduced to encapsulate all three fields as:: - - struct gpio { - unsigned gpio; - unsigned long flags; - const char *label; - }; - -A typical example of usage:: - - static struct gpio leds_gpios[] = { - { 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* default to ON */ - { 33, GPIOF_OUT_INIT_LOW, "Green LED" }, /* default to OFF */ - { 34, GPIOF_OUT_INIT_LOW, "Red LED" }, /* default to OFF */ - { 35, GPIOF_OUT_INIT_LOW, "Blue LED" }, /* default to OFF */ - { ... }, - }; - - err = gpio_request_one(31, GPIOF_IN, "Reset Button"); - if (err) - ... - - -GPIOs mapped to IRQs --------------------- -GPIO numbers are unsigned integers; so are IRQ numbers. These make up -two logically distinct namespaces (GPIO 0 need not use IRQ 0). You can -map between them using calls like:: - - /* map GPIO numbers to IRQ numbers */ - int gpio_to_irq(unsigned gpio); - -Those return either the corresponding number in the other namespace, or -else a negative errno code if the mapping can't be done. (For example, -some GPIOs can't be used as IRQs.) It is an unchecked error to use a GPIO -number that wasn't set up as an input using gpio_direction_input(), or -to use an IRQ number that didn't originally come from gpio_to_irq(). - -These two mapping calls are expected to cost on the order of a single -addition or subtraction. They're not allowed to sleep. - -Non-error values returned from gpio_to_irq() can be passed to request_irq() -or free_irq(). They will often be stored into IRQ resources for platform -devices, by the board-specific initialization code. Note that IRQ trigger -options are part of the IRQ interface, e.g. IRQF_TRIGGER_FALLING, as are -system wakeup capabilities. - - -Emulating Open Drain Signals ----------------------------- -Sometimes shared signals need to use "open drain" signaling, where only the -low signal level is actually driven. (That term applies to CMOS transistors; -"open collector" is used for TTL.) A pullup resistor causes the high signal -level. This is sometimes called a "wire-AND"; or more practically, from the -negative logic (low=true) perspective this is a "wire-OR". - -One common example of an open drain signal is a shared active-low IRQ line. -Also, bidirectional data bus signals sometimes use open drain signals. - -Some GPIO controllers directly support open drain outputs; many don't. When -you need open drain signaling but your hardware doesn't directly support it, -there's a common idiom you can use to emulate it with any GPIO pin that can -be used as either an input or an output: - - LOW: gpio_direction_output(gpio, 0) ... this drives the signal - and overrides the pullup. - - HIGH: gpio_direction_input(gpio) ... this turns off the output, - so the pullup (or some other device) controls the signal. - -If you are "driving" the signal high but gpio_get_value(gpio) reports a low -value (after the appropriate rise time passes), you know some other component -is driving the shared signal low. That's not necessarily an error. As one -common example, that's how I2C clocks are stretched: a slave that needs a -slower clock delays the rising edge of SCK, and the I2C master adjusts its -signaling rate accordingly. - - -GPIO controllers and the pinctrl subsystem ------------------------------------------- - -A GPIO controller on a SOC might be tightly coupled with the pinctrl -subsystem, in the sense that the pins can be used by other functions -together with an optional gpio feature. We have already covered the -case where e.g. a GPIO controller need to reserve a pin or set the -direction of a pin by calling any of:: - - pinctrl_gpio_request() - pinctrl_gpio_free() - pinctrl_gpio_direction_input() - pinctrl_gpio_direction_output() - -But how does the pin control subsystem cross-correlate the GPIO -numbers (which are a global business) to a certain pin on a certain -pin controller? - -This is done by registering "ranges" of pins, which are essentially -cross-reference tables. These are described in -Documentation/driver-api/pin-control.rst - -While the pin allocation is totally managed by the pinctrl subsystem, -gpio (under gpiolib) is still maintained by gpio drivers. It may happen -that different pin ranges in a SoC is managed by different gpio drivers. - -This makes it logical to let gpio drivers announce their pin ranges to -the pin ctrl subsystem before it will call 'pinctrl_gpio_request' in order -to request the corresponding pin to be prepared by the pinctrl subsystem -before any gpio usage. - -For this, the gpio controller can register its pin range with pinctrl -subsystem. There are two ways of doing it currently: with or without DT. - -For with DT support refer to Documentation/devicetree/bindings/gpio/gpio.txt. - -For non-DT support, user can call gpiochip_add_pin_range() with appropriate -parameters to register a range of gpio pins with a pinctrl driver. For this -exact name string of pinctrl device has to be passed as one of the -argument to this routine. - - -What do these conventions omit? -=============================== -One of the biggest things these conventions omit is pin multiplexing, since -this is highly chip-specific and nonportable. One platform might not need -explicit multiplexing; another might have just two options for use of any -given pin; another might have eight options per pin; another might be able -to route a given GPIO to any one of several pins. (Yes, those examples all -come from systems that run Linux today.) - -Related to multiplexing is configuration and enabling of the pullups or -pulldowns integrated on some platforms. Not all platforms support them, -or support them in the same way; and any given board might use external -pullups (or pulldowns) so that the on-chip ones should not be used. -(When a circuit needs 5 kOhm, on-chip 100 kOhm resistors won't do.) -Likewise drive strength (2 mA vs 20 mA) and voltage (1.8V vs 3.3V) is a -platform-specific issue, as are models like (not) having a one-to-one -correspondence between configurable pins and GPIOs. - -There are other system-specific mechanisms that are not specified here, -like the aforementioned options for input de-glitching and wire-OR output. -Hardware may support reading or writing GPIOs in gangs, but that's usually -configuration dependent: for GPIOs sharing the same bank. (GPIOs are -commonly grouped in banks of 16 or 32, with a given SOC having several such -banks.) Some systems can trigger IRQs from output GPIOs, or read values -from pins not managed as GPIOs. Code relying on such mechanisms will -necessarily be nonportable. - -Dynamic definition of GPIOs is not currently standard; for example, as -a side effect of configuring an add-on board with some GPIO expanders. - - -GPIO implementor's framework (OPTIONAL) -======================================= -As noted earlier, there is an optional implementation framework making it -easier for platforms to support different kinds of GPIO controller using -the same programming interface. This framework is called "gpiolib". - -As a debugging aid, if debugfs is available a /sys/kernel/debug/gpio file -will be found there. That will list all the controllers registered through -this framework, and the state of the GPIOs currently in use. - - -Controller Drivers: gpio_chip ------------------------------ -In this framework each GPIO controller is packaged as a "struct gpio_chip" -with information common to each controller of that type: - - - methods to establish GPIO direction - - methods used to access GPIO values - - flag saying whether calls to its methods may sleep - - optional debugfs dump method (showing extra state like pullup config) - - label for diagnostics - -There is also per-instance data, which may come from device.platform_data: -the number of its first GPIO, and how many GPIOs it exposes. - -The code implementing a gpio_chip should support multiple instances of the -controller, possibly using the driver model. That code will configure each -gpio_chip and issue gpiochip_add(). Removing a GPIO controller should be -rare; use gpiochip_remove() when it is unavoidable. - -Most often a gpio_chip is part of an instance-specific structure with state -not exposed by the GPIO interfaces, such as addressing, power management, -and more. Chips such as codecs will have complex non-GPIO state. - -Any debugfs dump method should normally ignore signals which haven't been -requested as GPIOs. They can use gpiochip_is_requested(), which returns -either NULL or the label associated with that GPIO when it was requested. - - -Platform Support ----------------- -To force-enable this framework, a platform's Kconfig will "select" GPIOLIB, -else it is up to the user to configure support for GPIO. - -If neither of these options are selected, the platform does not support -GPIOs through GPIO-lib and the code cannot be enabled by the user. - -Trivial implementations of those functions can directly use framework -code, which always dispatches through the gpio_chip:: - - #define gpio_get_value __gpio_get_value - #define gpio_set_value __gpio_set_value - -Fancier implementations could instead define those as inline functions with -logic optimizing access to specific SOC-based GPIOs. For example, if the -referenced GPIO is the constant "12", getting or setting its value could -cost as little as two or three instructions, never sleeping. When such an -optimization is not possible those calls must delegate to the framework -code, costing at least a few dozen instructions. For bitbanged I/O, such -instruction savings can be significant. - -For SOCs, platform-specific code defines and registers gpio_chip instances -for each bank of on-chip GPIOs. Those GPIOs should be numbered/labeled to -match chip vendor documentation, and directly match board schematics. They -may well start at zero and go up to a platform-specific limit. Such GPIOs -are normally integrated into platform initialization to make them always be -available, from arch_initcall() or earlier; they can often serve as IRQs. - - -Board Support -------------- -For external GPIO controllers -- such as I2C or SPI expanders, ASICs, multi -function devices, FPGAs or CPLDs -- most often board-specific code handles -registering controller devices and ensures that their drivers know what GPIO -numbers to use with gpiochip_add(). Their numbers often start right after -platform-specific GPIOs. - -For example, board setup code could create structures identifying the range -of GPIOs that chip will expose, and passes them to each GPIO expander chip -using platform_data. Then the chip driver's probe() routine could pass that -data to gpiochip_add(). - -Initialization order can be important. For example, when a device relies on -an I2C-based GPIO, its probe() routine should only be called after that GPIO -becomes available. That may mean the device should not be registered until -calls for that GPIO can work. One way to address such dependencies is for -such gpio_chip controllers to provide setup() and teardown() callbacks to -board specific code; those board specific callbacks would register devices -once all the necessary resources are available, and remove them later when -the GPIO controller device becomes unavailable. - - -Sysfs Interface for Userspace (OPTIONAL) -======================================== -Platforms which use the "gpiolib" implementors framework may choose to -configure a sysfs user interface to GPIOs. This is different from the -debugfs interface, since it provides control over GPIO direction and -value instead of just showing a gpio state summary. Plus, it could be -present on production systems without debugging support. - -Given appropriate hardware documentation for the system, userspace could -know for example that GPIO #23 controls the write protect line used to -protect boot loader segments in flash memory. System upgrade procedures -may need to temporarily remove that protection, first importing a GPIO, -then changing its output state, then updating the code before re-enabling -the write protection. In normal use, GPIO #23 would never be touched, -and the kernel would have no need to know about it. - -Again depending on appropriate hardware documentation, on some systems -userspace GPIO can be used to determine system configuration data that -standard kernels won't know about. And for some tasks, simple userspace -GPIO drivers could be all that the system really needs. - -Note that standard kernel drivers exist for common "LEDs and Buttons" -GPIO tasks: "leds-gpio" and "gpio_keys", respectively. Use those -instead of talking directly to the GPIOs; they integrate with kernel -frameworks better than your userspace code could. - - -Paths in Sysfs --------------- -There are three kinds of entry in /sys/class/gpio: - - - Control interfaces used to get userspace control over GPIOs; - - - GPIOs themselves; and - - - GPIO controllers ("gpio_chip" instances). - -That's in addition to standard files including the "device" symlink. - -The control interfaces are write-only: - - /sys/class/gpio/ - - "export" ... Userspace may ask the kernel to export control of - a GPIO to userspace by writing its number to this file. - - Example: "echo 19 > export" will create a "gpio19" node - for GPIO #19, if that's not requested by kernel code. - - "unexport" ... Reverses the effect of exporting to userspace. - - Example: "echo 19 > unexport" will remove a "gpio19" - node exported using the "export" file. - -GPIO signals have paths like /sys/class/gpio/gpio42/ (for GPIO #42) -and have the following read/write attributes: - - /sys/class/gpio/gpioN/ - - "direction" ... reads as either "in" or "out". This value may - normally be written. Writing as "out" defaults to - initializing the value as low. To ensure glitch free - operation, values "low" and "high" may be written to - configure the GPIO as an output with that initial value. - - Note that this attribute *will not exist* if the kernel - doesn't support changing the direction of a GPIO, or - it was exported by kernel code that didn't explicitly - allow userspace to reconfigure this GPIO's direction. - - "value" ... reads as either 0 (low) or 1 (high). If the GPIO - is configured as an output, this value may be written; - any nonzero value is treated as high. - - If the pin can be configured as interrupt-generating interrupt - and if it has been configured to generate interrupts (see the - description of "edge"), you can poll(2) on that file and - poll(2) will return whenever the interrupt was triggered. If - you use poll(2), set the events POLLPRI. If you use select(2), - set the file descriptor in exceptfds. After poll(2) returns, - either lseek(2) to the beginning of the sysfs file and read the - new value or close the file and re-open it to read the value. - - "edge" ... reads as either "none", "rising", "falling", or - "both". Write these strings to select the signal edge(s) - that will make poll(2) on the "value" file return. - - This file exists only if the pin can be configured as an - interrupt generating input pin. - - "active_low" ... reads as either 0 (false) or 1 (true). Write - any nonzero value to invert the value attribute both - for reading and writing. Existing and subsequent - poll(2) support configuration via the edge attribute - for "rising" and "falling" edges will follow this - setting. - -GPIO controllers have paths like /sys/class/gpio/gpiochip42/ (for the -controller implementing GPIOs starting at #42) and have the following -read-only attributes: - - /sys/class/gpio/gpiochipN/ - - "base" ... same as N, the first GPIO managed by this chip - - "label" ... provided for diagnostics (not always unique) - - "ngpio" ... how many GPIOs this manges (N to N + ngpio - 1) - -Board documentation should in most cases cover what GPIOs are used for -what purposes. However, those numbers are not always stable; GPIOs on -a daughtercard might be different depending on the base board being used, -or other cards in the stack. In such cases, you may need to use the -gpiochip nodes (possibly in conjunction with schematics) to determine -the correct GPIO number to use for a given signal. - - -API Reference -============= - -The functions listed in this section are deprecated. The GPIO descriptor based -API should be used in new code. - -.. kernel-doc:: drivers/gpio/gpiolib-legacy.c - :export: diff --git a/Documentation/driver-api/iio/buffers.rst b/Documentation/driver-api/iio/buffers.rst index e83026aebe97..63f364e862d1 100644 --- a/Documentation/driver-api/iio/buffers.rst +++ b/Documentation/driver-api/iio/buffers.rst @@ -15,8 +15,8 @@ trigger source. Multiple data channels can be read at once from IIO buffer sysfs interface ========================== An IIO buffer has an associated attributes directory under -:file:`/sys/bus/iio/iio:device{X}/buffer/*`. Here are some of the existing -attributes: +:file:`/sys/bus/iio/devices/iio:device{X}/buffer/*`. Here are some of the +existing attributes: * :file:`length`, the total number of data samples (capacity) that can be stored by the buffer. @@ -28,8 +28,8 @@ IIO buffer setup The meta information associated with a channel reading placed in a buffer is called a scan element. The important bits configuring scan elements are exposed to userspace applications via the -:file:`/sys/bus/iio/iio:device{X}/scan_elements/` directory. This directory contains -attributes of the following form: +:file:`/sys/bus/iio/devices/iio:device{X}/scan_elements/` directory. This +directory contains attributes of the following form: * :file:`enable`, used for enabling a channel. If and only if its attribute is non *zero*, then a triggered capture will contain data samples for this diff --git a/Documentation/driver-api/iio/core.rst b/Documentation/driver-api/iio/core.rst index 715cf29482a1..42b580fb2989 100644 --- a/Documentation/driver-api/iio/core.rst +++ b/Documentation/driver-api/iio/core.rst @@ -24,7 +24,7 @@ then we will show how a device driver makes use of an IIO device. There are two ways for a user space application to interact with an IIO driver. -1. :file:`/sys/bus/iio/iio:device{X}/`, this represents a hardware sensor +1. :file:`/sys/bus/iio/devices/iio:device{X}/`, this represents a hardware sensor and groups together the data channels of the same chip. 2. :file:`/dev/iio:device{X}`, character device node interface used for buffered data transfer and for events information retrieval. @@ -51,8 +51,8 @@ IIO device sysfs interface Attributes are sysfs files used to expose chip info and also allowing applications to set various configuration parameters. For device with -index X, attributes can be found under /sys/bus/iio/iio:deviceX/ directory. -Common attributes are: +index X, attributes can be found under /sys/bus/iio/devices/iio:deviceX/ +directory. Common attributes are: * :file:`name`, description of the physical chip. * :file:`dev`, shows the major:minor pair associated with @@ -60,7 +60,7 @@ Common attributes are: * :file:`sampling_frequency_available`, available discrete set of sampling frequency values for device. * Available standard attributes for IIO devices are described in the - :file:`Documentation/ABI/testing/sysfs-bus-iio` file in the Linux kernel + :file:Documentation/ABI/testing/sysfs-bus-iio file in the Linux kernel sources. IIO device channels @@ -140,16 +140,16 @@ Here is how we can make use of the channel's modifiers:: This channel's definition will generate two separate sysfs files for raw data retrieval: -* :file:`/sys/bus/iio/iio:device{X}/in_intensity_ir_raw` -* :file:`/sys/bus/iio/iio:device{X}/in_intensity_both_raw` +* :file:`/sys/bus/iio/devices/iio:device{X}/in_intensity_ir_raw` +* :file:`/sys/bus/iio/devices/iio:device{X}/in_intensity_both_raw` one file for processed data: -* :file:`/sys/bus/iio/iio:device{X}/in_illuminance_input` +* :file:`/sys/bus/iio/devices/iio:device{X}/in_illuminance_input` and one shared sysfs file for sampling frequency: -* :file:`/sys/bus/iio/iio:device{X}/sampling_frequency`. +* :file:`/sys/bus/iio/devices/iio:device{X}/sampling_frequency`. Here is how we can make use of the channel's indexing:: diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index f10decc2c14b..3e2a270bd828 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -81,11 +81,13 @@ Subsystem-specific APIs acpi/index backlight/lp855x-driver.rst clk + coco/index console crypto/index dmaengine/index dpll edac + extcon firmware/index fpga/index frame-buffer @@ -124,6 +126,7 @@ Subsystem-specific APIs pps ptp pwm + pwrseq regulator reset rfkill diff --git a/Documentation/driver-api/infiniband.rst b/Documentation/driver-api/infiniband.rst index 30e142ccbee9..10d8be9e74fe 100644 --- a/Documentation/driver-api/infiniband.rst +++ b/Documentation/driver-api/infiniband.rst @@ -77,14 +77,14 @@ iSCSI Extensions for RDMA (iSER) :internal: .. kernel-doc:: drivers/infiniband/ulp/iser/iscsi_iser.c - :functions: iscsi_iser_pdu_alloc iser_initialize_task_headers \ - iscsi_iser_task_init iscsi_iser_mtask_xmit iscsi_iser_task_xmit \ - iscsi_iser_cleanup_task iscsi_iser_check_protection \ - iscsi_iser_conn_create iscsi_iser_conn_bind \ - iscsi_iser_conn_start iscsi_iser_conn_stop \ - iscsi_iser_session_destroy iscsi_iser_session_create \ - iscsi_iser_set_param iscsi_iser_ep_connect iscsi_iser_ep_poll \ - iscsi_iser_ep_disconnect + :functions: iscsi_iser_pdu_alloc iser_initialize_task_headers + iscsi_iser_task_init iscsi_iser_mtask_xmit iscsi_iser_task_xmit + iscsi_iser_cleanup_task iscsi_iser_check_protection + iscsi_iser_conn_create iscsi_iser_conn_bind + iscsi_iser_conn_start iscsi_iser_conn_stop + iscsi_iser_session_destroy iscsi_iser_session_create + iscsi_iser_set_param iscsi_iser_ep_connect iscsi_iser_ep_poll + iscsi_iser_ep_disconnect .. kernel-doc:: drivers/infiniband/ulp/iser/iser_initiator.c :internal: diff --git a/Documentation/driver-api/infrastructure.rst b/Documentation/driver-api/infrastructure.rst index 3d52dfdfa9fd..35e36fee4238 100644 --- a/Documentation/driver-api/infrastructure.rst +++ b/Documentation/driver-api/infrastructure.rst @@ -41,6 +41,12 @@ Device Drivers Base .. kernel-doc:: drivers/base/class.c :export: +.. kernel-doc:: include/linux/device/faux.h + :internal: + +.. kernel-doc:: drivers/base/faux.c + :export: + .. kernel-doc:: drivers/base/node.c :internal: diff --git a/Documentation/driver-api/input.rst b/Documentation/driver-api/input.rst index 4bbb26ae2a89..bd7a3578ade7 100644 --- a/Documentation/driver-api/input.rst +++ b/Documentation/driver-api/input.rst @@ -40,3 +40,10 @@ Sparse keymap support .. kernel-doc:: drivers/input/sparse-keymap.c :export: +PS/2 protocol support +--------------------- +.. kernel-doc:: include/linux/libps2.h + :internal: + +.. kernel-doc:: drivers/input/serio/libps2.c + :export: diff --git a/Documentation/driver-api/ipmi.rst b/Documentation/driver-api/ipmi.rst index e224e47b6b09..2cc6c898ab90 100644 --- a/Documentation/driver-api/ipmi.rst +++ b/Documentation/driver-api/ipmi.rst @@ -45,7 +45,7 @@ manual), choose the 'IPMI SI handler' option. A driver also exists for direct I2C access to the IPMI management controller. Some boards support this, but it is unknown if it will work on every board. For this, choose 'IPMI SMBus handler', but be ready to try to do some -figuring to see if it will work on your system if the SMBIOS/APCI +figuring to see if it will work on your system if the SMBIOS/ACPI information is wrong or not present. It is fairly safe to have both these enabled and let the drivers auto-detect what is present. @@ -63,7 +63,7 @@ situation, you need to read the section below named 'The SI Driver' or IPMI defines a standard watchdog timer. You can enable this with the 'IPMI Watchdog Timer' config option. If you compile the driver into the kernel, then via a kernel command-line option you can have the -watchdog timer start as soon as it initializes. It also have a lot +watchdog timer start as soon as it initializes. It also has a lot of other options, see the 'Watchdog' section below for more details. Note that you can also have the watchdog continue to run if it is closed (by default it is disabled on close). Go into the 'Watchdog @@ -280,10 +280,8 @@ Creating the User To use the message handler, you must first create a user using ipmi_create_user. The interface number specifies which SMI you want to connect to, and you must supply callback functions to be called -when data comes in. The callback function can run at interrupt level, -so be careful using the callbacks. This also allows to you pass in a -piece of data, the handler_data, that will be passed back to you on -all calls. +when data comes in. This also allows to you pass in a piece of data, +the handler_data, that will be passed back to you on all calls. Once you are done, call ipmi_destroy_user() to get rid of the user. @@ -303,8 +301,7 @@ use it for anything you like. Responses come back in the function pointed to by the ipmi_recv_hndl field of the "handler" that you passed in to ipmi_create_user(). -Remember again, these may be running at interrupt level. Remember to -look at the receive type, too. +Remember to look at the receive type, too. From userland, you fill out an ipmi_req_t structure and use the IPMICTL_SEND_COMMAND ioctl. For incoming stuff, you can use select() @@ -317,13 +314,13 @@ This gives the receiver a place to actually put the message. If the message cannot fit into the data you provide, you will get an EMSGSIZE error and the driver will leave the data in the receive -queue. If you want to get it and have it truncate the message, us +queue. If you want to get it and have it truncate the message, use the IPMICTL_RECEIVE_MSG_TRUNC ioctl. When you send a command (which is defined by the lowest-order bit of the netfn per the IPMI spec) on the IPMB bus, the driver will automatically assign the sequence number to the command and save the -command. If the response is not receive in the IPMI-specified 5 +command. If the response is not received in the IPMI-specified 5 seconds, it will generate a response automatically saying the command timed out. If an unsolicited response comes in (if it was after 5 seconds, for instance), that response will be ignored. @@ -367,7 +364,7 @@ channel bitmasks do not overlap. To respond to a received command, set the response bit in the returned netfn, use the address from the received message, and use the same -msgid that you got in the receive message. +msgid that you got in the received message. From userland, equivalent IOCTLs are provided to do these functions. @@ -440,7 +437,7 @@ register would be 0xca6. This defaults to 1. The regsizes parameter gives the size of a register, in bytes. The data used by IPMI is 8-bits wide, but it may be inside a larger -register. This parameter allows the read and write type to specified. +register. This parameter allows the read and write type to be specified. It may be 1, 2, 4, or 8. The default is 1. Since the register size may be larger than 32 bits, the IPMI data may not @@ -481,8 +478,8 @@ If your IPMI interface does not support interrupts and is a KCS or SMIC interface, the IPMI driver will start a kernel thread for the interface to help speed things up. This is a low-priority kernel thread that constantly polls the IPMI driver while an IPMI operation -is in progress. The force_kipmid module parameter will all the user to -force this thread on or off. If you force it off and don't have +is in progress. The force_kipmid module parameter will allow the user +to force this thread on or off. If you force it off and don't have interrupts, the driver will run VERY slowly. Don't blame me, these interfaces suck. @@ -540,7 +537,7 @@ at module load time (for a module) with:: alerts_broken The addresses are normal I2C addresses. The adapter is the string -name of the adapter, as shown in /sys/class/i2c-adapter/i2c-<n>/name. +name of the adapter, as shown in /sys/bus/i2c/devices/i2c-<n>/name. It is *NOT* i2c-<n> itself. Also, the comparison is done ignoring spaces, so if the name is "This is an I2C chip" you can say adapter_name=ThisisanI2cchip. This is because it's hard to pass in @@ -583,7 +580,7 @@ kernel command line as:: These are the same options as on the module command line. The I2C driver does not support non-blocking access or polling, so -this driver cannod to IPMI panic events, extend the watchdog at panic +this driver cannot do IPMI panic events, extend the watchdog at panic time, or other panic-related IPMI functions without special kernel patches and driver modifications. You can get those at the openipmi web page. @@ -610,7 +607,7 @@ Parameters are:: ipmi_ipmb.retry_time_ms=<Time between retries on IPMB> ipmi_ipmb.max_retries=<Number of times to retry a message> -Loading the module will not result in the driver automatcially +Loading the module will not result in the driver automatically starting unless there is device tree information setting it up. If you want to instantiate one of these by hand, do:: diff --git a/Documentation/driver-api/media/camera-sensor.rst b/Documentation/driver-api/media/camera-sensor.rst index b4920b34cebc..c290833165e6 100644 --- a/Documentation/driver-api/media/camera-sensor.rst +++ b/Documentation/driver-api/media/camera-sensor.rst @@ -81,10 +81,10 @@ restart when the system is resumed. This requires coordination between the camera sensor and the rest of the camera pipeline. Bridge drivers are responsible for this coordination, and instruct camera sensors to stop and restart streaming by calling the appropriate subdev operations -(``.s_stream()``, ``.enable_streams()`` or ``.disable_streams()``). Camera -sensor drivers shall therefore **not** keep track of the streaming state to -stop streaming in the PM suspend handler and restart it in the resume handler. -Drivers should in general not implement the system PM handlers. +(``.enable_streams()`` or ``.disable_streams()``). Camera sensor drivers shall +therefore **not** keep track of the streaming state to stop streaming in the PM +suspend handler and restart it in the resume handler. Drivers should in general +not implement the system PM handlers. Camera sensor drivers shall **not** implement the subdev ``.s_power()`` operation, as it is deprecated. While this operation is implemented in some diff --git a/Documentation/driver-api/media/drivers/index.rst b/Documentation/driver-api/media/drivers/index.rst index c4123a16b5f9..7f6f3dcd5c90 100644 --- a/Documentation/driver-api/media/drivers/index.rst +++ b/Documentation/driver-api/media/drivers/index.rst @@ -26,6 +26,7 @@ Video4Linux (V4L) drivers vimc-devel zoran ccs/ccs + ipu6 Digital TV drivers diff --git a/Documentation/driver-api/media/drivers/ipu6.rst b/Documentation/driver-api/media/drivers/ipu6.rst new file mode 100644 index 000000000000..88f6498e74db --- /dev/null +++ b/Documentation/driver-api/media/drivers/ipu6.rst @@ -0,0 +1,190 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +Intel IPU6 Driver +================== + +Author: Bingbu Cao <bingbu.cao@intel.com> + +Overview +========= + +Intel IPU6 is the sixth generation of Intel Image Processing Unit used in some +Intel Chipsets such as Tiger Lake, Jasper Lake, Alder Lake, Raptor Lake and +Meteor Lake. IPU6 consists of two major systems: Input System (ISYS) and +Processing System (PSYS). IPU6 are visible on the PCI bus as a single device, it +can be found by ``lspci``: + +``0000:00:05.0 Multimedia controller: Intel Corporation Device xxxx (rev xx)`` + +IPU6 has a 16 MB BAR in PCI configuration Space for MMIO registers which is +visible for driver. + +Buttress +========= + +The IPU6 is connecting to the system fabric with Buttress which is enabling host +driver to control the IPU6, it also allows IPU6 access the system memory to +store and load frame pixel streams and any other metadata. + +Buttress mainly manages several system functionalities: power management, +interrupt handling, firmware authentication and global timer sync. + +ISYS and PSYS Power flow +------------------------ + +IPU6 driver initialize the ISYS and PSYS power up or down request by setting the +Buttress frequency control register for ISYS and PSYS +(``IPU6_BUTTRESS_REG_IS_FREQ_CTL`` and ``IPU6_BUTTRESS_REG_PS_FREQ_CTL``) in +function: + +.. c:function:: int ipu6_buttress_power(...) + +Buttress forwards the request to Punit, after Punit execute the power up flow, +Buttress indicates driver that ISYS or PSYS is powered up by updating the power +status registers. + +.. Note:: ISYS power up needs take place prior to PSYS power up, ISYS power down + needs take place after PSYS power down due to hardware limitation. + +Interrupt +--------- + +IPU6 interrupt can be generated as MSI or INTA, interrupt will be triggered when +ISYS, PSYS, Buttress event or error happen, driver can get the interrupt cause +by reading the interrupt status register ``BUTTRESS_REG_ISR_STATUS``, driver +clears the irq status and then calls specific ISYS or PSYS irq handler. + +.. c:function:: irqreturn_t ipu6_buttress_isr(int irq, ...) + +Security and firmware authentication +------------------------------------- + +To address the IPU6 firmware security concerns, the IPU6 firmware needs to +undergo an authentication process before it is allowed to executed on the IPU6 +internal processors. The IPU6 driver will work with Converged Security Engine +(CSE) to complete authentication process. The CSE is responsible of +authenticating the IPU6 firmware. The authenticated firmware binary is copied +into an isolated memory region. Firmware authentication process is implemented +by CSE following an IPC handshake with the IPU6 driver. There are some Buttress +registers used by the CSE and the IPU6 driver to communicate with each other via +IPC. + +.. c:function:: int ipu6_buttress_authenticate(...) + +Global timer sync +----------------- + +The IPU6 driver initiates a Hammock Harbor synchronization flow each time it +starts camera operation. The IPU6 will synchronizes an internal counter in the +Buttress with a copy of the SoC time, this counter maintains the up-to-date time +until camera operation is stopped. The IPU6 driver can use this time counter to +calibrate the timestamp based on the timestamp in response event from firmware. + +.. c:function:: int ipu6_buttress_start_tsc_sync(...) + +DMA and MMU +============ + +The IPU6 has its own scalar processor where the firmware run at and an internal +32-bit virtual address space. The IPU6 has MMU address translation hardware to +allow that scalar processors to access the internal memory and external system +memory through IPU6 virtual address. The address translation is based on two +levels of page lookup tables stored in system memory which are maintained by the +IPU6 driver. The IPU6 driver sets the level-1 page table base address to MMU +register and allows MMU to perform page table lookups. + +The IPU6 driver exports its own DMA operations. The IPU6 driver will update the +page table entries for each DMA operation and invalidate the MMU TLB after each +unmap and free. + +Firmware file format +==================== + +The IPU6 firmware is in Code Partition Directory (CPD) file format. The CPD +firmware contains a CPD header, several CPD entries and components. The CPD +component includes 3 entries - manifest, metadata and module data. Manifest and +metadata are defined by CSE and used by CSE for authentication. Module data is +specific to IPU6 which holds the binary data of firmware called package +directory. The IPU6 driver (``ipu6-cpd.c`` in particular) parses and validates +the CPD firmware file and gets the package directory binary data of the IPU6 +firmware, copies it to specific DMA buffer and sets its base address to Buttress +``FW_SOURCE_BASE`` register. Finally the CSE will do authentication for this +firmware binary. + + +Syscom interface +================ + +The IPU6 driver communicates with firmware via the Syscom ABI. Syscom is an +inter-processor communication mechanism between the IPU scalar processors and +the CPU. There are a number of resources shared between firmware and software. +A system memory region where the message queues reside, firmware can access the +memory region via the IPU MMU. The Syscom queues are FIFO fixed depth queues +with a configurable number of tokens (messages). There are also common IPU6 MMIO +registers where the queue read and write indices reside. Software and firmware +function as producer and consumer of tokens in the queues and update the write +and read indices separately when sending or receiving each message. + +The IPU6 driver must prepare and configure the number of input and output +queues, configure the count of tokens per queue and the size of per token before +initiating and starting the communication with firmware. Firmware and software +must use same configurations. The IPU6 Buttress has a number of firmware boot +parameter registers which can be used to store the address of configuration and +initialise the Syscom state, then driver can request firmware to start and run via +setting the scalar processor control status register. + +Input System +============ + +IPU6 input system consists of MIPI D-PHY and several CSI-2 receivers. It can +capture image pixel data from camera sensors or other MIPI CSI-2 output devices. + +D-PHYs and CSI-2 ports lane mapping +----------------------------------- + +The IPU6 integrates different D-PHY IPs on different SoCs, on Tiger Lake and +Alder Lake, IPU6 integrates MCD10 D-PHY, IPU6SE on Jasper Lake integrates JSL +D-PHY and IPU6EP on Meteor Lake integrates a Synopsys DWC D-PHY. There is an +adaptional layer between D-PHY and CSI-2 receiver controller which includes port +configuration, PHY wrapper or private test interfaces for D-PHY. There are 3 +D-PHY drivers ``ipu6-isys-mcd-phy.c``, ``ipu6-isys-jsl-phy.c`` and +``ipu6-isys-dwc-phy.c`` program the above 3 D-PHYs in IPU6. + +Different IPU6 versions have different D-PHY lanes mappings, On Tiger Lake, +there are 12 data lanes and 8 clock lanes, IPU6 support maximum 8 CSI-2 ports, +see the PPI mmapping in ``ipu6-isys-mcd-phy.c`` for more information. On Jasper +Lake and Alder Lake, D-PHY has 8 data lanes and 4 clock lanes, the IPU6 supports +maximum 4 CSI-2 ports. For Meteor Lake, D-PHY has 12 data lanes and 6 clock +lanes so IPU6 support maximum 6 CSI-2 ports. + +.. Note:: Each pair of CSI-2 two ports is a single unit that can share the data + lanes. For example, for CSI-2 port 0 and 1, CSI-2 port 0 support + maximum 4 data lanes, CSI-2 port 1 support maximum 2 data lanes, CSI-2 + port 0 with 2 data lanes can work together with CSI-2 port 1 with 2 + data lanes. If trying to use CSI-2 port 0 with 4 lanes, CSI-2 port 1 + will not be available as the 4 data lanes are shared by CSI-2 port 0 + and 1. The same applies to CSI ports 2/3, 4/5 and 7/8. + +ISYS firmware ABIs +------------------ + +The IPU6 firmware implements a series of ABIs for software access. In general, +software firstly prepares the stream configuration ``struct +ipu6_fw_isys_stream_cfg_data_abi`` and sends the configuration to firmware via +sending ``STREAM_OPEN`` command. Stream configuration includes input pins and +output pins, input pin ``struct ipu6_fw_isys_input_pin_info_abi`` defines the +resolution and data type of input source, output pin ``struct +ipu6_fw_isys_output_pin_info_abi`` defines the output resolution, stride and +frame format, etc. + +Once the driver gets the interrupt from firmware that indicates stream open +successfully, the driver will send the ``STREAM_START`` and ``STREAM_CAPTURE`` +command to request firmware to start capturing image frames. ``STREAM_CAPTURE`` +command queues the buffers to firmware with ``struct +ipu6_fw_isys_frame_buff_set``, software then waits for the interrupt and +response from firmware, ``PIN_DATA_READY`` means a buffer is ready on a specific +output pin and then software can return the buffer to user. + +.. Note:: See :ref:`Examples<ipu6_isys_capture_examples>` about how to do + capture by IPU6 ISYS driver. diff --git a/Documentation/driver-api/media/drivers/zoran.rst b/Documentation/driver-api/media/drivers/zoran.rst index b205e10c3154..3e05b7f0442a 100644 --- a/Documentation/driver-api/media/drivers/zoran.rst +++ b/Documentation/driver-api/media/drivers/zoran.rst @@ -222,7 +222,7 @@ The CCIR - I uses the PAL colorsystem, and is used in Great Britain, Hong Kong, Ireland, Nigeria, South Africa. The CCIR - N uses the PAL colorsystem and PAL frame size but the NTSC framerate, -and is used in Argentinia, Uruguay, an a few others +and is used in Argentina, Uruguay, an a few others We do not talk about how the audio is broadcast ! diff --git a/Documentation/driver-api/media/maintainer-entry-profile.rst b/Documentation/driver-api/media/maintainer-entry-profile.rst index ffc712a5f632..ad96a89ee916 100644 --- a/Documentation/driver-api/media/maintainer-entry-profile.rst +++ b/Documentation/driver-api/media/maintainer-entry-profile.rst @@ -116,7 +116,7 @@ CEC drivers ``cec-compliance`` .. [3] The ``v4l2-compliance`` also covers the media controller usage inside V4L2 drivers. -Other compilance tools are under development to check other parts of the +Other compliance tools are under development to check other parts of the subsystem. Those tests need to pass before the patches go upstream. diff --git a/Documentation/driver-api/media/mc-core.rst b/Documentation/driver-api/media/mc-core.rst index 2456950ce8ff..1d010bd7ec49 100644 --- a/Documentation/driver-api/media/mc-core.rst +++ b/Documentation/driver-api/media/mc-core.rst @@ -144,7 +144,8 @@ valid values are described at :c:func:`media_create_pad_link()` and Graph traversal ^^^^^^^^^^^^^^^ -The media framework provides APIs to iterate over entities in a graph. +The media framework provides APIs to traverse media graphs, locating connected +entities and links. To iterate over all entities belonging to a media device, drivers can use the media_device_for_each_entity macro, defined in @@ -159,31 +160,6 @@ the media_device_for_each_entity macro, defined in ... } -Drivers might also need to iterate over all entities in a graph that can be -reached only through enabled links starting at a given entity. The media -framework provides a depth-first graph traversal API for that purpose. - -.. note:: - - Graphs with cycles (whether directed or undirected) are **NOT** - supported by the graph traversal API. To prevent infinite loops, the graph - traversal code limits the maximum depth to ``MEDIA_ENTITY_ENUM_MAX_DEPTH``, - currently defined as 16. - -Drivers initiate a graph traversal by calling -:c:func:`media_graph_walk_start()` - -The graph structure, provided by the caller, is initialized to start graph -traversal at the given entity. - -Drivers can then retrieve the next entity by calling -:c:func:`media_graph_walk_next()` - -When the graph traversal is complete the function will return ``NULL``. - -Graph traversal can be interrupted at any moment. No cleanup function call -is required and the graph structure can be freed normally. - Helper functions can be used to find a link between two given pads, or a pad connected to another pad through an enabled link (:c:func:`media_entity_find_link()`, :c:func:`media_pad_remote_pad_first()`, @@ -276,6 +252,45 @@ Subsystems should facilitate link validation by providing subsystem specific helper functions to provide easy access for commonly needed information, and in the end provide a way to use driver-specific callbacks. +Pipeline traversal +^^^^^^^^^^^^^^^^^^ + +Once a pipeline has been constructed with :c:func:`media_pipeline_start()`, +drivers can iterate over entities or pads in the pipeline with the +:c:macro:´media_pipeline_for_each_entity` and +:c:macro:´media_pipeline_for_each_pad` macros. Iterating over pads is +straightforward: + +.. code-block:: c + + media_pipeline_pad_iter iter; + struct media_pad *pad; + + media_pipeline_for_each_pad(pipe, &iter, pad) { + /* 'pad' will point to each pad in turn */ + ... + } + +To iterate over entities, the iterator needs to be initialized and cleaned up +as an additional steps: + +.. code-block:: c + + media_pipeline_entity_iter iter; + struct media_entity *entity; + int ret; + + ret = media_pipeline_entity_iter_init(pipe, &iter); + if (ret) + ...; + + media_pipeline_for_each_entity(pipe, &iter, entity) { + /* 'entity' will point to each entity in turn */ + ... + } + + media_pipeline_entity_iter_cleanup(&iter); + Media Controller Device Allocator API ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/Documentation/driver-api/media/tx-rx.rst b/Documentation/driver-api/media/tx-rx.rst index 29d66a47b56e..0b8c9cde8ee4 100644 --- a/Documentation/driver-api/media/tx-rx.rst +++ b/Documentation/driver-api/media/tx-rx.rst @@ -49,12 +49,31 @@ Link frequency The :ref:`V4L2_CID_LINK_FREQ <v4l2-cid-link-freq>` control is used to tell the receiver the frequency of the bus (i.e. it is not the same as the symbol rate). -``.s_stream()`` callback -^^^^^^^^^^^^^^^^^^^^^^^^ +Drivers that do not have user-configurable link frequency should report it +through the ``.get_mbus_config()`` subdev pad operation, in the ``link_freq`` +field of struct v4l2_mbus_config, instead of through controls. + +Receiver drivers should use :c:func:`v4l2_get_link_freq` helper to obtain the +link frequency from the transmitter sub-device. + +``.enable_streams()`` and ``.disable_streams()`` callbacks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The struct struct v4l2_subdev_video_ops->s_stream() callback is used by the -receiver driver to control the transmitter driver's streaming state. +The struct v4l2_subdev_pad_ops->enable_streams() and struct +v4l2_subdev_pad_ops->disable_streams() callbacks are used by the receiver driver +to control the transmitter driver's streaming state. These callbacks may not be +called directly, but by using ``v4l2_subdev_enable_streams()`` and +``v4l2_subdev_disable_streams()``. + +Stopping the transmitter +^^^^^^^^^^^^^^^^^^^^^^^^ +A transmitter stops sending the stream of images as a result of +calling the ``.disable_streams()`` callback. Some transmitters may stop the +stream at a frame boundary whereas others stop immediately, +effectively leaving the current frame unfinished. The receiver driver +should not make assumptions either way, but function properly in both +cases. CSI-2 transmitter drivers ------------------------- @@ -76,14 +95,15 @@ where * - link_freq - The value of the ``V4L2_CID_LINK_FREQ`` integer64 menu item. * - nr_of_lanes - - Number of data lanes used on the CSI-2 link. This can - be obtained from the OF endpoint configuration. + - Number of data lanes used on the CSI-2 link. * - 2 - Data is transferred on both rising and falling edge of the signal. * - bits_per_sample - Number of bits per sample. * - k - - 16 for D-PHY and 7 for C-PHY + - 16 for D-PHY and 7 for C-PHY. + +Information on whether D-PHY or C-PHY is used, and the value of ``nr_of_lanes``, can be obtained from the OF endpoint configuration. .. note:: @@ -122,13 +142,3 @@ device, so this should be only done when it is needed. Receiver drivers that do not need explicit LP-11 or LP-111 state setup are waived from calling the two callbacks. - -Stopping the transmitter -^^^^^^^^^^^^^^^^^^^^^^^^ - -A transmitter stops sending the stream of images as a result of -calling the ``.s_stream()`` callback. Some transmitters may stop the -stream at a frame boundary whereas others stop immediately, -effectively leaving the current frame unfinished. The receiver driver -should not make assumptions either way, but function properly in both -cases. diff --git a/Documentation/driver-api/media/v4l2-core.rst b/Documentation/driver-api/media/v4l2-core.rst index 58cba831ade5..ad987c34ad2a 100644 --- a/Documentation/driver-api/media/v4l2-core.rst +++ b/Documentation/driver-api/media/v4l2-core.rst @@ -26,3 +26,4 @@ Video4Linux devices v4l2-tuner v4l2-common v4l2-tveeprom + v4l2-jpeg diff --git a/Documentation/driver-api/media/v4l2-jpeg.rst b/Documentation/driver-api/media/v4l2-jpeg.rst new file mode 100644 index 000000000000..af3bc52f865b --- /dev/null +++ b/Documentation/driver-api/media/v4l2-jpeg.rst @@ -0,0 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +V4L2 JPEG header related functions and data structures +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. kernel-doc:: include/media/v4l2-jpeg.h + :internal: + +.. kernel-doc:: drivers/media/v4l2-core/v4l2-jpeg.c + :export: diff --git a/Documentation/driver-api/mmc/index.rst b/Documentation/driver-api/mmc/index.rst index 7339736ac774..8188863e5959 100644 --- a/Documentation/driver-api/mmc/index.rst +++ b/Documentation/driver-api/mmc/index.rst @@ -10,4 +10,5 @@ MMC/SD/SDIO card support mmc-dev-attrs mmc-dev-parts mmc-async-req + mmc-test mmc-tools diff --git a/Documentation/driver-api/mmc/mmc-test.rst b/Documentation/driver-api/mmc/mmc-test.rst new file mode 100644 index 000000000000..1fe33eb43742 --- /dev/null +++ b/Documentation/driver-api/mmc/mmc-test.rst @@ -0,0 +1,299 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +MMC Test Framework +======================== + +Overview +======== + +The `mmc_test` framework is designed to test the performance and reliability of host controller drivers and all devices handled by the MMC subsystem. This includes not only MMC devices but also SD cards and other devices supported by the subsystem. + +The framework provides a variety of tests to evaluate different aspects of the host controller and device interactions, such as read and write performance, data integrity, and error handling. These tests help ensure that the host controller drivers and devices operate correctly under various conditions. + +The `mmc_test` framework is particularly useful for: + +- Verifying the functionality and performance of MMC and SD host controller drivers. +- Ensuring compatibility and reliability of MMC and SD devices. +- Identifying and diagnosing issues in the MMC subsystem. + +The results of the tests are logged in the kernel log, providing detailed information about the test outcomes and any encountered issues. + +Note: whatever is on your card will be overwritten by these tests. + +Initialization +============== + +To use the ``mmc_test`` framework, follow these steps: + +1. **Enable the MMC Test Framework**: + + Ensure that the ``CONFIG_MMC_TEST`` kernel configuration option is enabled. This can be done by configuring the kernel: + + .. code-block:: none + + make menuconfig + + Navigate to: + + Device Drivers ---> + <*> MMC/SD/SDIO card support ---> + [*] MMC host test driver + + Alternatively, you can enable it directly in the kernel configuration file: + + .. code-block:: none + + echo "CONFIG_MMC_TEST=y" >> .config + + Rebuild and install the kernel if necessary. + +2. **Load the MMC Test Module**: + + If the ``mmc_test`` framework is built as a module, you need to load it using ``modprobe``: + + .. code-block:: none + + modprobe mmc_test + +Binding the MMC Card for Testing +================================ + +To enable MMC testing, you need to unbind the MMC card from the ``mmcblk`` driver and bind it to the ``mmc_test`` driver. This allows the ``mmc_test`` framework to take control of the MMC card for testing purposes. + +1. Identify the MMC card: + + .. code-block:: sh + + ls /sys/bus/mmc/devices/ + + This will list the MMC devices, such as ``mmc0:0001``. + +2. Unbind the MMC card from the ``mmcblk`` driver: + + .. code-block:: sh + + echo 'mmc0:0001' > /sys/bus/mmc/drivers/mmcblk/unbind + +3. Bind the MMC card to the ``mmc_test`` driver: + + .. code-block:: sh + + echo 'mmc0:0001' > /sys/bus/mmc/drivers/mmc_test/bind + +After binding, you should see a line in the kernel log indicating that the card has been claimed for testing: + +.. code-block:: none + + mmc_test mmc0:0001: Card claimed for testing. + + +Usage - Debugfs Entries +======================= + +Once the ``mmc_test`` framework is enabled, you can interact with the following debugfs entries located in ``/sys/kernel/debug/mmc0/mmc0:0001``: + +1. **test**: + + This file is used to run specific tests. Write the test number to this file to execute a test. + + .. code-block:: sh + + echo <test_number> > /sys/kernel/debug/mmc0/mmc0:0001/test + + The test result is indicated in the kernel log info. You can view the kernel log using the `dmesg` command or by checking the log file in `/var/log/`. + + .. code-block:: sh + + dmesg | grep mmc0 + + Example: + + To run test number 4 (Basic read with data verification): + + .. code-block:: sh + + echo 4 > /sys/kernel/debug/mmc0/mmc0:0001/test + + Check the kernel log for the result: + + .. code-block:: sh + + dmesg | grep mmc0 + +2. **testlist**: + + This file lists all available tests. You can read this file to see the list of tests and their corresponding numbers. + + .. code-block:: sh + + cat /sys/kernel/debug/mmc0/mmc0:0001/testlist + + The available tests are listed in the table below: + ++------+--------------------------------+---------------------------------------------+ +| Test | Test Name | Test Description | ++======+================================+=============================================+ +| 0 | Run all tests | Runs all available tests | ++------+--------------------------------+---------------------------------------------+ +| 1 | Basic write | Performs a basic write operation of a | +| | | single 512-Byte block to the MMC card | +| | | without data verification. | ++------+--------------------------------+---------------------------------------------+ +| 2 | Basic read | Same for read | ++------+--------------------------------+---------------------------------------------+ +| 3 | Basic write | Performs a basic write operation of a | +| | (with data verification) | single 512-Byte block to the MMC card | +| | | with data verification by reading back | +| | | the written data and comparing it. | ++------+--------------------------------+---------------------------------------------+ +| 4 | Basic read | Same for read | +| | (with data verification) | | ++------+--------------------------------+---------------------------------------------+ +| 5 | Multi-block write | Performs a multi-block write operation of | +| | | 8 blocks (each 512 bytes) to the MMC card. | ++------+--------------------------------+---------------------------------------------+ +| 6 | Multi-block read | Same for read | ++------+--------------------------------+---------------------------------------------+ +| 7 | Power of two block writes | Performs write operations with block sizes | +| | | that are powers of two, starting from 1 | +| | | byte up to 256 bytes, to the MMC card. | ++------+--------------------------------+---------------------------------------------+ +| 8 | Power of two block reads | Same for read | ++------+--------------------------------+---------------------------------------------+ +| 9 | Weird sized block writes | Performs write operations with varying | +| | | block sizes starting from 3 bytes and | +| | | increasing by 7 bytes each iteration, up | +| | | to 511 bytes, to the MMC card. | ++------+--------------------------------+---------------------------------------------+ +| 10 | Weird sized block reads | same for read | ++------+--------------------------------+---------------------------------------------+ +| 11 | Badly aligned write | Performs write operations with buffers | +| | | starting at different alignments (0 to 7 | +| | | bytes offset) to test how the MMC card | +| | | handles unaligned data transfers. | ++------+--------------------------------+---------------------------------------------+ +| 12 | Badly aligned read | same for read | ++------+--------------------------------+---------------------------------------------+ +| 13 | Badly aligned multi-block write| same for multi-write | ++------+--------------------------------+---------------------------------------------+ +| 14 | Badly aligned multi-block read | same for multi-read | ++------+--------------------------------+---------------------------------------------+ +| 15 | Proper xfer_size at write | intentionally create a broken transfer by | +| | (Start failure) | modifying the MMC request in a way that it | +| | | will not perform as expected, e.g. use | +| | | MMC_WRITE_BLOCK for a multi-block transfer | ++------+--------------------------------+---------------------------------------------+ +| 16 | Proper xfer_size at read | same for read | +| | (Start failure) | | ++------+--------------------------------+---------------------------------------------+ +| 17 | Proper xfer_size at write | same for 2 blocks | +| | (Midway failure) | | ++------+--------------------------------+---------------------------------------------+ +| 18 | Proper xfer_size at read | same for read | +| | (Midway failure) | | ++------+--------------------------------+---------------------------------------------+ +| 19 | Highmem write | use a high memory page | ++------+--------------------------------+---------------------------------------------+ +| 20 | Highmem read | same for read | ++------+--------------------------------+---------------------------------------------+ +| 21 | Multi-block highmem write | same for multi-write | ++------+--------------------------------+---------------------------------------------+ +| 22 | Multi-block highmem read | same for mult-read | ++------+--------------------------------+---------------------------------------------+ +| 23 | Best-case read performance | Performs 512K sequential read (non sg) | ++------+--------------------------------+---------------------------------------------+ +| 24 | Best-case write performance | same for write | ++------+--------------------------------+---------------------------------------------+ +| 25 | Best-case read performance | Same using sg | +| | (Into scattered pages) | | ++------+--------------------------------+---------------------------------------------+ +| 26 | Best-case write performance | same for write | +| | (From scattered pages) | | ++------+--------------------------------+---------------------------------------------+ +| 27 | Single read performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 28 | Single write performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 29 | Single trim performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 30 | Consecutive read performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 31 | Consecutive write performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 32 | Consecutive trim performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 33 | Random read performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 34 | Random write performance | By transfer size | ++------+--------------------------------+---------------------------------------------+ +| 35 | Large sequential read | Into scattered pages | ++------+--------------------------------+---------------------------------------------+ +| 36 | Large sequential write | From scattered pages | ++------+--------------------------------+---------------------------------------------+ +| 37 | Write performance | With blocking req 4k to 4MB | ++------+--------------------------------+---------------------------------------------+ +| 38 | Write performance | With non-blocking req 4k to 4MB | ++------+--------------------------------+---------------------------------------------+ +| 39 | Read performance | With blocking req 4k to 4MB | ++------+--------------------------------+---------------------------------------------+ +| 40 | Read performance | With non-blocking req 4k to 4MB | ++------+--------------------------------+---------------------------------------------+ +| 41 | Write performance | Blocking req 1 to 512 sg elems | ++------+--------------------------------+---------------------------------------------+ +| 42 | Write performance | Non-blocking req 1 to 512 sg elems | ++------+--------------------------------+---------------------------------------------+ +| 43 | Read performance | Blocking req 1 to 512 sg elems | ++------+--------------------------------+---------------------------------------------+ +| 44 | Read performance | Non-blocking req 1 to 512 sg elems | ++------+--------------------------------+---------------------------------------------+ +| 45 | Reset test | | ++------+--------------------------------+---------------------------------------------+ +| 46 | Commands during read | No Set Block Count (CMD23) | ++------+--------------------------------+---------------------------------------------+ +| 47 | Commands during write | No Set Block Count (CMD23) | ++------+--------------------------------+---------------------------------------------+ +| 48 | Commands during read | Use Set Block Count (CMD23) | ++------+--------------------------------+---------------------------------------------+ +| 49 | Commands during write | Use Set Block Count (CMD23) | ++------+--------------------------------+---------------------------------------------+ +| 50 | Commands during non-blocking | Read - use Set Block Count (CMD23) | ++------+--------------------------------+---------------------------------------------+ +| 51 | Commands during non-blocking | Write - use Set Block Count (CMD23) | ++------+--------------------------------+---------------------------------------------+ + +Test Results +============ + +The results of the tests are logged in the kernel log. Each test logs the start, end, and result of the test. The possible results are: + +- **OK**: The test completed successfully. +- **FAILED**: The test failed. +- **UNSUPPORTED (by host)**: The test is unsupported by the host. +- **UNSUPPORTED (by card)**: The test is unsupported by the card. +- **ERROR**: An error occurred during the test. + +Example Kernel Log Output +========================= + +When running a test, you will see log entries similar to the following in the kernel log: + +.. code-block:: none + + [ 1234.567890] mmc0: Starting tests of card mmc0:0001... + [ 1234.567891] mmc0: Test case 4. Basic read (with data verification)... + [ 1234.567892] mmc0: Result: OK + [ 1234.567893] mmc0: Tests completed. + +In this example, test case 4 (Basic read with data verification) was executed, and the result was OK. + + +Contributing +============ + +Contributions to the `mmc_test` framework are welcome. Please follow the standard Linux kernel contribution guidelines and submit patches to the appropriate maintainers. + +Contact +======= + +For more information or to report issues, please contact the MMC subsystem maintainers. diff --git a/Documentation/driver-api/ntb.rst b/Documentation/driver-api/ntb.rst index e991d92b8b1d..a49c41383779 100644 --- a/Documentation/driver-api/ntb.rst +++ b/Documentation/driver-api/ntb.rst @@ -35,7 +35,7 @@ anyone who has written a pci driver. NTB Typical client driver implementation ---------------------------------------- -Primary purpose of NTB is to share some peace of memory between at least two +Primary purpose of NTB is to share some piece of memory between at least two systems. So the NTB device features like Scratchpad/Message registers are mainly used to perform the proper memory window initialization. Typically there are two types of memory window interfaces supported by the NTB API: diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst index ca16b5acbf30..c205efa4d45b 100644 --- a/Documentation/driver-api/nvdimm/nvdimm.rst +++ b/Documentation/driver-api/nvdimm/nvdimm.rst @@ -535,12 +535,12 @@ internally with a static identifier:: char devname[50]; snprintf(devname, sizeof(devname), "namespace%d.%d", - ndctl_region_get_id(region), paramaters->id); + ndctl_region_get_id(region), parameters->id); ndctl_namespace_set_alt_name(ndns, devname); /* 'uuid' must be set prior to setting size! */ - ndctl_namespace_set_uuid(ndns, paramaters->uuid); - ndctl_namespace_set_size(ndns, paramaters->size); + ndctl_namespace_set_uuid(ndns, parameters->uuid); + ndctl_namespace_set_size(ndns, parameters->size); /* unlike pmem namespaces, blk namespaces have a sector size */ if (parameters->lbasize) ndctl_namespace_set_sector_size(ndns, parameters->lbasize); diff --git a/Documentation/driver-api/nvmem.rst b/Documentation/driver-api/nvmem.rst index 5d9500d21ecc..790e2dc652c0 100644 --- a/Documentation/driver-api/nvmem.rst +++ b/Documentation/driver-api/nvmem.rst @@ -59,10 +59,10 @@ For example, a simple nvram case:: devm_nvmem_register(&config); } -Users of board files can define and register nvmem cells using the -nvmem_cell_table struct:: +Device drivers can define and register an nvmem cell using the nvmem_cell_info +struct:: - static struct nvmem_cell_info foo_nvmem_cells[] = { + static const struct nvmem_cell_info foo_nvmem_cell = { { .name = "macaddr", .offset = 0x7f00, @@ -70,13 +70,7 @@ nvmem_cell_table struct:: } }; - static struct nvmem_cell_table foo_nvmem_cell_table = { - .nvmem_name = "i2c-eeprom", - .cells = foo_nvmem_cells, - .ncells = ARRAY_SIZE(foo_nvmem_cells), - }; - - nvmem_add_cell_table(&foo_nvmem_cell_table); + int nvmem_add_one_cell(nvmem, &foo_nvmem_cell); Additionally it is possible to create nvmem cell lookup entries and register them with the nvmem framework from machine code as shown in the example below:: diff --git a/Documentation/driver-api/pci/pci.rst b/Documentation/driver-api/pci/pci.rst index aa40b1cc243b..59d86e827198 100644 --- a/Documentation/driver-api/pci/pci.rst +++ b/Documentation/driver-api/pci/pci.rst @@ -46,6 +46,9 @@ PCI Support Library .. kernel-doc:: drivers/pci/pci-sysfs.c :internal: +.. kernel-doc:: drivers/pci/tph.c + :export: + PCI Hotplug Support Library --------------------------- diff --git a/Documentation/driver-api/phy/phy.rst b/Documentation/driver-api/phy/phy.rst index 81785c084f3e..719a2b3fd2ab 100644 --- a/Documentation/driver-api/phy/phy.rst +++ b/Documentation/driver-api/phy/phy.rst @@ -198,8 +198,7 @@ pm_runtime_get_sync of PHY provider device because of parent-child relationship. It should also be noted that phy_power_on and phy_power_off performs phy_pm_runtime_get_sync and phy_pm_runtime_put respectively. There are exported APIs like phy_pm_runtime_get, phy_pm_runtime_get_sync, -phy_pm_runtime_put, phy_pm_runtime_put_sync, phy_pm_runtime_allow and -phy_pm_runtime_forbid for performing PM operations. +phy_pm_runtime_put and phy_pm_runtime_put_sync for performing PM operations. PHY Mappings ============ diff --git a/Documentation/driver-api/pin-control.rst b/Documentation/driver-api/pin-control.rst index 4639912dc9cc..27ea1236307e 100644 --- a/Documentation/driver-api/pin-control.rst +++ b/Documentation/driver-api/pin-control.rst @@ -1002,7 +1002,7 @@ it even more compact which assumes you want to use pinctrl-foo and position .. code-block:: c static struct pinctrl_map mapping[] __initdata = { - PIN_MAP_MUX_GROUP("foo-i2c.o", PINCTRL_STATE_DEFAULT, + PIN_MAP_MUX_GROUP("foo-i2c.0", PINCTRL_STATE_DEFAULT, "pinctrl-foo", NULL, "i2c0"), }; diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index d448cb57df86..8d86d5da4023 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -358,7 +358,7 @@ the phases are: ``prepare``, ``suspend``, ``suspend_late``, ``suspend_noirq``. is probed against the device in question by passing them to the :c:func:`dev_pm_set_driver_flags` helper function.] If the first of these flags is set, the PM core will not apply the direct-complete - procedure described above to the given device and, consequenty, to any + procedure described above to the given device and, consequently, to any of its ancestors. The second flag, when set, informs the middle layer code (bus types, device types, PM domains, classes) that it should take the return value of the ``->prepare`` callback provided by the driver diff --git a/Documentation/driver-api/pps.rst b/Documentation/driver-api/pps.rst index 78dded03e5d8..598729f9cd27 100644 --- a/Documentation/driver-api/pps.rst +++ b/Documentation/driver-api/pps.rst @@ -202,6 +202,45 @@ Sometimes one needs to be able not only to catch PPS signals but to produce them also. For example, running a distributed simulation, which requires computers' clock to be synchronized very tightly. +To do so the class pps-gen has been added. PPS generators can be +registered in the kernel by defining a struct pps_gen_source_info as +follows:: + + static const struct pps_gen_source_info pps_gen_dummy_info = { + .use_system_clock = true, + .get_time = pps_gen_dummy_get_time, + .enable = pps_gen_dummy_enable, + }; + +Where the use_system_clock states if the generator uses the system +clock to generate its pulses, or they are from a peripheral device +clock. Method get_time() is used to query the time stored into the +generator clock, while the method enable() is used to enable or +disable the PPS pulse generation. + +Then calling the function pps_gen_register_source() in your +initialization routine as follows creates a new generator in the +system:: + + pps_gen = pps_gen_register_source(&pps_gen_dummy_info); + +Generators SYSFS support +------------------------ + +If the SYSFS filesystem is enabled in the kernel it provides a new class:: + + $ ls /sys/class/pps-gen/ + pps-gen0/ pps-gen1/ pps-gen2/ + +Every directory is the ID of a PPS generator defined in the system and +inside of it you find several files:: + + $ ls -F /sys/class/pps-gen/pps-gen0/ + dev enable name power/ subsystem@ system time uevent + +To enable the PPS signal generation you can use the command below:: + + $ echo 1 > /sys/class/pps-gen/pps-gen0/enable Parallel port generator ------------------------ @@ -246,3 +285,27 @@ delay between assert and clear edge as small as possible to reduce system latencies. But if it is too small slave won't be able to capture clear edge transition. The default of 30us should be good enough in most situations. The delay can be selected using 'delay' pps_gen_parport module parameter. + + +Intel Timed I/O PPS signal generator +------------------------------------ + +Intel Timed I/O is a high precision device, present on 2019 and newer Intel +CPUs, that can generate PPS signals. + +Timed I/O and system time are both driven by same hardware clock. The signal +is generated with a precision of ~20 nanoseconds. The generated PPS signal +is used to synchronize an external device with system clock. For example, +it can be used to share your clock with a device that receives PPS signal, +generated by Timed I/O device. There are dedicated Timed I/O pins to deliver +the PPS signal to an external device. + +Usage of Intel Timed I/O as PPS generator: + +Start generating PPS signal:: + + $echo 1 > /sys/class/pps-gen/pps-genx/enable + +Stop generating PPS signal:: + + $echo 0 > /sys/class/pps-gen/pps-genx/enable diff --git a/Documentation/driver-api/pwrseq.rst b/Documentation/driver-api/pwrseq.rst new file mode 100644 index 000000000000..ad18b2326b68 --- /dev/null +++ b/Documentation/driver-api/pwrseq.rst @@ -0,0 +1,95 @@ +.. SPDX-License-Identifier: GPL-2.0-only +.. Copyright 2024 Linaro Ltd. + +==================== +Power Sequencing API +==================== + +:Author: Bartosz Golaszewski + +Introduction +============ + +This framework is designed to abstract complex power-up sequences that are +shared between multiple logical devices in the Linux kernel. + +The intention is to allow consumers to obtain a power sequencing handle +exposed by the power sequence provider and delegate the actual requesting and +control of the underlying resources as well as to allow the provider to +mitigate any potential conflicts between multiple users behind the scenes. + +Glossary +-------- + +The power sequencing API uses a number of terms specific to the subsystem: + +Unit + + A unit is a discrete chunk of a power sequence. For instance one unit may + enable a set of regulators, another may enable a specific GPIO. Units can + define dependencies in the form of other units that must be enabled before + it itself can be. + +Target + + A target is a set of units (composed of the "final" unit and its + dependencies) that a consumer selects by its name when requesting a handle + to the power sequencer. Via the dependency system, multiple targets may + share the same parts of a power sequence but ignore parts that are + irrelevant. + +Descriptor + + A handle passed by the pwrseq core to every consumer that serves as the + entry point to the provider layer. It ensures coherence between different + users and keeps reference counting consistent. + +Consumer interface +================== + +The consumer API is aimed to be as simple as possible. The driver interested in +getting a descriptor from the power sequencer should call pwrseq_get() and +specify the name of the target it wants to reach in the sequence after calling +pwrseq_power_up(). The descriptor can be released by calling pwrseq_put() and +the consumer can request the powering down of its target with +pwrseq_power_off(). Note that there is no guarantee that pwrseq_power_off() +will have any effect as there may be multiple users of the underlying resources +who may keep them active. + +Provider interface +================== + +The provider API is admittedly not nearly as straightforward as the one for +consumers but it makes up for it in flexibility. + +Each provider can logically split the power-up sequence into discrete chunks +(units) and define their dependencies. They can then expose named targets that +consumers may use as the final point in the sequence that they wish to reach. + +To that end the providers fill out a set of configuration structures and +register with the pwrseq subsystem by calling pwrseq_device_register(). + +Dynamic consumer matching +------------------------- + +The main difference between pwrseq and other Linux kernel providers is the +mechanism for dynamic matching of consumers and providers. Every power sequence +provider driver must implement the `match()` callback and pass it to the pwrseq +core when registering with the subsystems. + +When a client requests a sequencer handle, the core will call this callback for +every registered provider and let it flexibly figure out whether the proposed +client device is indeed its consumer. For example: if the provider binds to the +device-tree node representing a power management unit of a chipset and the +consumer driver controls one of its modules, the provider driver may parse the +relevant regulator supply properties in device tree and see if they lead from +the PMU to the consumer. + +API reference +============= + +.. kernel-doc:: include/linux/pwrseq/provider.h + :internal: + +.. kernel-doc:: drivers/power/sequencing/core.c + :export: diff --git a/Documentation/driver-api/scsi.rst b/Documentation/driver-api/scsi.rst index 273281474c09..bf2be96cc2d6 100644 --- a/Documentation/driver-api/scsi.rst +++ b/Documentation/driver-api/scsi.rst @@ -126,7 +126,7 @@ Manage scsi_dev_info_list, which tracks blacklisted and whitelisted devices. .. kernel-doc:: drivers/scsi/scsi_devinfo.c - :internal: + :export: drivers/scsi/scsi_ioctl.c ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -162,7 +162,6 @@ statistics and to pass information directly to the lowlevel driver. I.E. plumbing to manage /proc/scsi/\* .. kernel-doc:: drivers/scsi/scsi_proc.c - :internal: drivers/scsi/scsi_netlink.c ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -193,7 +192,7 @@ else, sequentially scan LUNs up until some maximum is reached, or a LUN is seen that cannot have a device attached to it. .. kernel-doc:: drivers/scsi/scsi_scan.c - :internal: + :export: drivers/scsi/scsi_sysctl.c ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst index 84b43061c11b..fa1ebfcd4472 100644 --- a/Documentation/driver-api/serial/driver.rst +++ b/Documentation/driver-api/serial/driver.rst @@ -101,6 +101,6 @@ Modem control lines via GPIO Some helpers are provided in order to set/get modem control lines via GPIO. .. kernel-doc:: drivers/tty/serial/serial_mctrl_gpio.c - :identifiers: mctrl_gpio_init mctrl_gpio_free mctrl_gpio_to_gpiod + :identifiers: mctrl_gpio_init mctrl_gpio_to_gpiod mctrl_gpio_set mctrl_gpio_get mctrl_gpio_enable_ms - mctrl_gpio_disable_ms + mctrl_gpio_disable_ms_sync mctrl_gpio_disable_ms_no_sync diff --git a/Documentation/driver-api/soundwire/bra.rst b/Documentation/driver-api/soundwire/bra.rst new file mode 100644 index 000000000000..8500253fa3e8 --- /dev/null +++ b/Documentation/driver-api/soundwire/bra.rst @@ -0,0 +1,336 @@ +========================== +Bulk Register Access (BRA) +========================== + +Conventions +----------- + +Capitalized words used in this documentation are intentional and refer +to concepts of the SoundWire 1.x specification. + +Introduction +------------ + +The SoundWire 1.x specification provides a mechanism to speed-up +command/control transfers by reclaiming parts of the audio +bandwidth. The Bulk Register Access (BRA) protocol is a standard +solution based on the Bulk Payload Transport (BPT) definitions. + +The regular control channel uses Column 0 and can only send/retrieve +one byte per frame with write/read commands. With a typical 48kHz +frame rate, only 48kB/s can be transferred. + +The optional Bulk Register Access capability can transmit up to 12 +Mbits/s and reduce transfer times by several orders of magnitude, but +has multiple design constraints: + + (1) Each frame can only support a read or a write transfer, with a + 10-byte overhead per frame (header and footer response). + + (2) The read/writes SHALL be from/to contiguous register addresses + in the same frame. A fragmented register space decreases the + efficiency of the protocol by requiring multiple BRA transfers + scheduled in different frames. + + (3) The targeted Peripheral device SHALL support the optional Data + Port 0, and likewise the Manager SHALL expose audio-like Ports + to insert BRA packets in the audio payload using the concepts of + Sample Interval, HSTART, HSTOP, etc. + + (4) The BRA transport efficiency depends on the available + bandwidth. If there are no on-going audio transfers, the entire + frame minus Column 0 can be reclaimed for BRA. The frame shape + also impacts efficiency: since Column0 cannot be used for + BTP/BRA, the frame should rely on a large number of columns and + minimize the number of rows. The bus clock should be as high as + possible. + + (5) The number of bits transferred per frame SHALL be a multiple of + 8 bits. Padding bits SHALL be inserted if necessary at the end + of the data. + + (6) The regular read/write commands can be issued in parallel with + BRA transfers. This is convenient to e.g. deal with alerts, jack + detection or change the volume during firmware download, but + accessing the same address with two independent protocols has to + be avoided to avoid undefined behavior. + + (7) Some implementations may not be capable of handling the + bandwidth of the BRA protocol, e.g. in the case of a slow I2C + bus behind the SoundWire IP. In this case, the transfers may + need to be spaced in time or flow-controlled. + + (8) Each BRA packet SHALL be marked as 'Active' when valid data is + to be transmitted. This allows for software to allocate a BRA + stream but not transmit/discard data while processing the + results or preparing the next batch of data, or allowing the + peripheral to deal with the previous transfer. In addition BRA + transfer can be started early on without data being ready. + + (9) Up to 470 bytes may be transmitted per frame. + + (10) The address is represented with 32 bits and does not rely on + the paging registers used for the regular command/control + protocol in Column 0. + + +Error checking +-------------- + +Firmware download is one of the key usages of the Bulk Register Access +protocol. To make sure the binary data integrity is not compromised by +transmission or programming errors, each BRA packet provides: + + (1) A CRC on the 7-byte header. This CRC helps the Peripheral Device + check if it is addressed and set the start address and number of + bytes. The Peripheral Device provides a response in Byte 7. + + (2) A CRC on the data block (header excluded). This CRC is + transmitted as the last-but-one byte in the packet, prior to the + footer response. + +The header response can be one of: + (a) Ack + (b) Nak + (c) Not Ready + +The footer response can be one of: + (1) Ack + (2) Nak (CRC failure) + (3) Good (operation completed) + (4) Bad (operation failed) + +Example frame +------------- + +The example below is not to scale and makes simplifying assumptions +for clarity. The different chunks in the BRA packets are not required +to start on a new SoundWire Row, and the scale of data may vary. + + :: + + +---+--------------------------------------------+ + + | | + + | BRA HEADER | + + | | + + +--------------------------------------------+ + + C | HEADER CRC | + + O +--------------------------------------------+ + + M | HEADER RESPONSE | + + M +--------------------------------------------+ + + A | | + + N | | + + D | DATA | + + | | + + | | + + | | + + +--------------------------------------------+ + + | DATA CRC | + + +--------------------------------------------+ + + | FOOTER RESPONSE | + +---+--------------------------------------------+ + + +Assuming the frame uses N columns, the configuration shown above can +be programmed by setting the DP0 registers as: + + - HSTART = 1 + - HSTOP = N - 1 + - Sampling Interval = N + - WordLength = N - 1 + +Addressing restrictions +----------------------- + +The Device Number specified in the Header follows the SoundWire +definitions, and broadcast and group addressing are permitted. For now +the Linux implementation only allows for a single BPT transfer to a +single device at a time. This might be revisited at a later point as +an optimization to send the same firmware to multiple devices, but +this would only be beneficial for single-link solutions. + +In the case of multiple Peripheral devices attached to different +Managers, the broadcast and group addressing is not supported by the +SoundWire specification. Each device must be handled with separate BRA +streams, possibly in parallel - the links are really independent. + +Unsupported features +-------------------- + +The Bulk Register Access specification provides a number of +capabilities that are not supported in known implementations, such as: + + (1) Transfers initiated by a Peripheral Device. The BRA Initiator is + always the Manager Device. + + (2) Flow-control capabilities and retransmission based on the + 'NotReady' header response require extra buffering in the + SoundWire IP and are not implemented. + +Bi-directional handling +----------------------- + +The BRA protocol can handle writes as well as reads, and in each +packet the header and footer response are provided by the Peripheral +Target device. On the Peripheral device, the BRA protocol is handled +by a single DP0 data port, and at the low-level the bus ownership can +will change for header/footer response as well as the data transmitted +during a read. + +On the host side, most implementations rely on a Port-like concept, +with two FIFOs consuming/generating data transfers in parallel +(Host->Peripheral and Peripheral->Host). The amount of data +consumed/produced by these FIFOs is not symmetrical, as a result +hardware typically inserts markers to help software and hardware +interpret raw data + +Each packet will typically have: + + (1) a 'Start of Packet' indicator. + + (2) an 'End of Packet' indicator. + + (3) a packet identifier to correlate the data requested and + transmitted, and the error status for each frame + +Hardware implementations can check errors at the frame level, and +retry a transfer in case of errors. However, as for the flow-control +case, this requires extra buffering and intelligence in the +hardware. The Linux support assumes that the entire transfer is +cancelled if a single error is detected in one of the responses. + +Abstraction required +~~~~~~~~~~~~~~~~~~~~ + +There are no standard registers or mandatory implementation at the +Manager level, so the low-level BPT/BRA details must be hidden in +Manager-specific code. For example the Cadence IP format above is not +known to the codec drivers. + +Likewise, codec drivers should not have to know the frame size. The +computation of CRC and handling of responses is handled in helpers and +Manager-specific code. + +The host BRA driver may also have restrictions on pages allocated for +DMA, or other host-DSP communication protocols. The codec driver +should not be aware of any of these restrictions, since it might be +reused in combination with different implementations of Manager IPs. + +Concurrency between BRA and regular read/write +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The existing 'nread/nwrite' API already relies on a notion of start +address and number of bytes, so it would be possible to extend this +API with a 'hint' requesting BPT/BRA be used. + +However BRA transfers could be quite long, and the use of a single +mutex for regular read/write and BRA is a show-stopper. Independent +operation of the control/command and BRA transfers is a fundamental +requirement, e.g. to change the volume level with the existing regmap +interface while downloading firmware. The integration must however +ensure that there are no concurrent access to the same address with +the command/control protocol and the BRA protocol. + +In addition, the 'sdw_msg' structure hard-codes support for 16-bit +addresses and paging registers which are irrelevant for BPT/BRA +support based on native 32-bit addresses. A separate API with +'sdw_bpt_msg' makes more sense. + +One possible strategy to speed-up all initialization tasks would be to +start a BRA transfer for firmware download, then deal with all the +"regular" read/writes in parallel with the command channel, and last +to wait for the BRA transfers to complete. This would allow for a +degree of overlap instead of a purely sequential solution. As such, +the BRA API must support async transfers and expose a separate wait +function. + + +Peripheral/bus interface +------------------------ + +The bus interface for BPT/BRA is made of two functions: + + - sdw_bpt_send_async(bpt_message) + + This function sends the data using the Manager + implementation-defined capabilities (typically DMA or IPC + protocol). + + Queueing is currently not supported, the caller + needs to wait for completion of the requested transfer. + + - sdw_bpt_wait() + + This function waits for the entire message provided by the + codec driver in the 'send_async' stage. Intermediate status for + smaller chunks will not be provided back to the codec driver, + only a return code will be provided. + +Regmap use +~~~~~~~~~~ + +Existing codec drivers rely on regmap to download firmware to +Peripherals. regmap exposes an async interface similar to the +send/wait API suggested above, so at a high-level it would seem +natural to combine BRA and regmap. The regmap layer could check if BRA +is available or not, and use a regular read-write command channel in +the latter case. + +The regmap integration will be handled in a second step. + +BRA stream model +---------------- + +For regular audio transfers, the machine driver exposes a dailink +connecting CPU DAI(s) and Codec DAI(s). + +This model is not required BRA support: + + (1) The SoundWire DAIs are mainly wrappers for SoundWire Data + Ports, with possibly some analog or audio conversion + capabilities bolted behind the Data Port. In the context of + BRA, the DP0 is the destination. DP0 registers are standard and + can be programmed blindly without knowing what Peripheral is + connected to each link. In addition, if there are multiple + Peripherals on a link and some of them do not support DP0, the + write commands to program DP0 registers will generate harmless + COMMAND_IGNORED responses that will be wired-ORed with + responses from Peripherals which support DP0. In other words, + the DP0 programming can be done with broadcast commands, and + the information on the Target device can be added only in the + BRA Header. + + (2) At the CPU level, the DAI concept is not useful for BRA; the + machine driver will not create a dailink relying on DP0. The + only concept that is needed is the notion of port. + + (3) The stream concept relies on a set of master_rt and slave_rt + concepts. All of these entities represent ports and not DAIs. + + (4) With the assumption that a single BRA stream is used per link, + that stream can connect master ports as well as all peripheral + DP0 ports. + + (5) BRA transfers only make sense in the context of one + Manager/Link, so the BRA stream handling does not rely on the + concept of multi-link aggregation allowed by regular DAI links. + +Audio DMA support +----------------- + +Some DMAs, such as HDaudio, require an audio format field to be +set. This format is in turn used to define acceptable bursts. BPT/BRA +support is not fully compatible with these definitions in that the +format and bandwidth may vary between read and write commands. + +In addition, on Intel HDaudio Intel platforms the DMAs need to be +programmed with a PCM format matching the bandwidth of the BPT/BRA +transfer. The format is based on 192kHz 32-bit samples, and the number +of channels varies to adjust the bandwidth. The notion of channel is +completely notional since the data is not typical audio +PCM. Programming such channels helps reserve enough bandwidth and adjust +FIFO sizes to avoid xruns. + +Alignment requirements are currently not enforced at the core level +but at the platform-level, e.g. for Intel the data sizes must be +multiples of 32 bytes. diff --git a/Documentation/driver-api/soundwire/bra_cadence.rst b/Documentation/driver-api/soundwire/bra_cadence.rst new file mode 100644 index 000000000000..4560752e8f47 --- /dev/null +++ b/Documentation/driver-api/soundwire/bra_cadence.rst @@ -0,0 +1,66 @@ +Cadence IP BRA support +---------------------- + +Format requirements +~~~~~~~~~~~~~~~~~~~ + +The Cadence IP relies on PDI0 for TX and PDI1 for RX. The data needs +to be formatted with the following conventions: + + (1) all Data is stored in bits 15..0 of the 32-bit PDI FIFOs. + + (2) the start of packet is BIT(31). + + (3) the end of packet is BIT(30). + + (4) A packet ID is stored in bits 19..16. This packet ID is + determined by software and is typically a rolling counter. + + (5) Padding shall be inserted as needed so that the Header CRC, + Header response, Footer CRC, Footer response are always in + Byte0. Padding is inserted by software for writes, and on reads + software shall discard the padding added by the hardware. + +Example format +~~~~~~~~~~~~~~ + +The following table represents the sequence provided to PDI0 for a +write command followed by a read command. + +:: + + +---+---+--------+---------------+---------------+ + + 1 | 0 | ID = 0 | WR HDR[1] | WR HDR[0] | + + | | | WR HDR[3] | WR HDR[2] | + + | | | WR HDR[5] | WR HDR[4] | + + | | | pad | WR HDR CRC | + + | | | WR Data[1] | WR Data[0] | + + | | | WR Data[3] | WR Data[2] | + + | | | WR Data[n-2] | WR Data[n-3] | + + | | | pad | WR Data[n-1] | + + 0 | 1 | | pad | WR Data CRC | + +---+---+--------+---------------+---------------+ + + 1 | 0 | ID = 1 | RD HDR[1] | RD HDR[0] | + + | | | RD HDR[3] | RD HDR[2] | + + | | | RD HDR[5] | RD HDR[4] | + + 0 | 1 | | pad | RD HDR CRC | + +---+---+--------+---------------+---------------+ + + +The table below represents the data received on PDI1 for the same +write command followed by a read command. + +:: + + +---+---+--------+---------------+---------------+ + + 1 | 0 | ID = 0 | pad | WR Hdr Rsp | + + 0 | 1 | | pad | WR Ftr Rsp | + +---+---+--------+---------------+---------------+ + + 1 | 0 | ID = 0 | pad | Rd Hdr Rsp | + + | | | RD Data[1] | RD Data[0] | + + | | | RD Data[3] | RD Data[2] | + + | | | RD HDR[n-2] | RD Data[n-3] | + + | | | pad | RD Data[n-1] | + + | | | pad | RD Data CRC | + + 0 | 1 | | pad | RD Ftr Rsp | + +---+---+--------+---------------+---------------+ diff --git a/Documentation/driver-api/soundwire/index.rst b/Documentation/driver-api/soundwire/index.rst index 234911a0db99..ef8d90dfbdde 100644 --- a/Documentation/driver-api/soundwire/index.rst +++ b/Documentation/driver-api/soundwire/index.rst @@ -9,6 +9,8 @@ SoundWire Documentation stream error_handling locking + bra + bra_cadence .. only:: subproject and html diff --git a/Documentation/driver-api/soundwire/stream.rst b/Documentation/driver-api/soundwire/stream.rst index 2a794484f62c..d66201299633 100644 --- a/Documentation/driver-api/soundwire/stream.rst +++ b/Documentation/driver-api/soundwire/stream.rst @@ -291,7 +291,7 @@ per stream. From ASoC DPCM framework, this stream state maybe linked to .. code-block:: c - int sdw_alloc_stream(char * stream_name); + int sdw_alloc_stream(char * stream_name, enum sdw_stream_type type); The SoundWire core provides a sdw_startup_stream() helper function, typically called during a dailink .startup() callback, which performs diff --git a/Documentation/driver-api/soundwire/summary.rst b/Documentation/driver-api/soundwire/summary.rst index 01dcb954f6d7..df78053743b5 100644 --- a/Documentation/driver-api/soundwire/summary.rst +++ b/Documentation/driver-api/soundwire/summary.rst @@ -184,14 +184,6 @@ function that provides capabilities information. Bus needs to know a set of Slave capabilities to program Slave registers and to control the Bus reconfigurations. -Future enhancements to be done -============================== - - (1) Bulk Register Access (BRA) transfers. - - - (2) Multiple data lane support. - Links ===== diff --git a/Documentation/driver-api/thermal/intel_dptf.rst b/Documentation/driver-api/thermal/intel_dptf.rst index 8fb8c5b2d685..ec5769accae0 100644 --- a/Documentation/driver-api/thermal/intel_dptf.rst +++ b/Documentation/driver-api/thermal/intel_dptf.rst @@ -191,6 +191,27 @@ ABI. User space can specify any one of the available workload type using this interface. +:file:`/sys/bus/pci/devices/0000\:00\:04.0/ptc_0_control` +:file:`/sys/bus/pci/devices/0000\:00\:04.0/ptc_1_control` +:file:`/sys/bus/pci/devices/0000\:00\:04.0/ptc_2_control` + +All these controls needs admin privilege to update. + +``enable`` (RW) + 1 for enable, 0 for disable. Shows the current enable status of + platform temperature control feature. User space can enable/disable + hardware controls. + +``temperature_target`` (RW) + Update a new temperature target in milli degree celsius for hardware to + use for the temperature control. + +Given that this is platform temperature control, it is expected that a +single user-level manager owns and manages the controls. If multiple +user-level software applications attempt to write different targets, it +can lead to unexpected behavior. + + DPTF Processor thermal RFIM interface -------------------------------------------- diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst index 6c1175c6afba..f73de211bdce 100644 --- a/Documentation/driver-api/thermal/sysfs-api.rst +++ b/Documentation/driver-api/thermal/sysfs-api.rst @@ -4,8 +4,6 @@ Generic Thermal Sysfs driver How To Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com> -Updated: 2 January 2008 - Copyright (c) 2008 Intel Corporation @@ -38,61 +36,57 @@ temperature) and throttle appropriate devices. :: - struct thermal_zone_device - *thermal_zone_device_register(char *type, - int trips, int mask, void *devdata, - struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, - int passive_delay, int polling_delay)) + struct thermal_zone_device * + thermal_zone_device_register_with_trips(const char *type, + const struct thermal_trip *trips, + int num_trips, void *devdata, + const struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + unsigned int passive_delay, + unsigned int polling_delay) - This interface function adds a new thermal zone device (sensor) to + This interface function adds a new thermal zone device (sensor) to the /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the - thermal cooling devices registered at the same time. + thermal cooling devices registered to it at the same time. type: the thermal zone type. trips: - the total number of trip points this thermal zone supports. - mask: - Bit string: If 'n'th bit is set, then trip point 'n' is writable. + the table of trip points for this thermal zone. devdata: device private data ops: thermal zone device call-backs. - .bind: - bind the thermal zone device with a thermal cooling device. - .unbind: - unbind the thermal zone device with a thermal cooling device. + .should_bind: + check whether or not a given cooling device should be bound to + a given trip point in this thermal zone. .get_temp: get the current temperature of the thermal zone. .set_trips: - set the trip points window. Whenever the current temperature - is updated, the trip points immediately below and above the - current temperature are found. - .get_mode: - get the current mode (enabled/disabled) of the thermal zone. - - - "enabled" means the kernel thermal management is - enabled. - - "disabled" will prevent kernel thermal driver action - upon trip points so that user applications can take - charge of thermal management. - .set_mode: - set the mode (enabled/disabled) of the thermal zone. - .get_trip_type: - get the type of certain trip point. - .get_trip_temp: - get the temperature above which the certain trip point - will be fired. + set the trip points window. Whenever the current temperature + is updated, the trip points immediately below and above the + current temperature are found. + .change_mode: + change the mode (enabled/disabled) of the thermal zone. + .set_trip_temp: + set the temperature of a given trip point. + .get_crit_temp: + get the critical temperature for this thermal zone. .set_emul_temp: - set the emulation temperature which helps in debugging - different threshold temperature points. + set the emulation temperature which helps in debugging + different threshold temperature points. + .get_trend: + get the trend of most recent zone temperature changes. + .hot: + hot trip point crossing handler. + .critical: + critical trip point crossing handler. tzp: thermal zone platform parameters. passive_delay: - number of milliseconds to wait between polls when - performing passive cooling. + number of milliseconds to wait between polls when performing passive + cooling. polling_delay: number of milliseconds to wait between polls when checking whether trip points have been crossed (0 for interrupt driven systems). @@ -251,56 +245,6 @@ temperature) and throttle appropriate devices. It deletes the corresponding entry from /sys/class/thermal folder and unbinds itself from all the thermal zone devices using it. -1.3 interface for binding a thermal zone device with a thermal cooling device ------------------------------------------------------------------------------ - - :: - - int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, - int trip, struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower, unsigned int weight); - - This interface function binds a thermal cooling device to a particular trip - point of a thermal zone device. - - This function is usually called in the thermal zone device .bind callback. - - tz: - the thermal zone device - cdev: - thermal cooling device - trip: - indicates which trip point in this thermal zone the cooling device - is associated with. - upper: - the Maximum cooling state for this trip point. - THERMAL_NO_LIMIT means no upper limit, - and the cooling device can be in max_state. - lower: - the Minimum cooling state can be used for this trip point. - THERMAL_NO_LIMIT means no lower limit, - and the cooling device can be in cooling state 0. - weight: - the influence of this cooling device in this thermal - zone. See 1.4.1 below for more information. - - :: - - int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, - int trip, struct thermal_cooling_device *cdev); - - This interface function unbinds a thermal cooling device from a particular - trip point of a thermal zone device. This function is usually called in - the thermal zone device .unbind callback. - - tz: - the thermal zone device - cdev: - thermal cooling device - trip: - indicates which trip point in this thermal zone the cooling device - is associated with. - 1.4 Thermal Zone Parameters --------------------------- @@ -371,8 +315,6 @@ Thermal cooling device sys I/F, created once it's registered:: Then next two dynamic attributes are created/removed in pairs. They represent the relationship between a thermal zone and its associated cooling device. -They are created/removed for each successful execution of -thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device. :: @@ -464,32 +406,28 @@ are supposed to implement the callback. If they don't, the thermal framework calculated the trend by comparing the previous and the current temperature values. -4.2. get_thermal_instance -------------------------- - -This function returns the thermal_instance corresponding to a given -{thermal_zone, cooling_device, trip_point} combination. Returns NULL -if such an instance does not exist. - -4.3. thermal_cdev_update +4.2. thermal_cdev_update ------------------------ This function serves as an arbitrator to set the state of a cooling device. It sets the cooling device to the deepest cooling state if possible. -5. thermal_emergency_poweroff -============================= +5. Critical Events +================== + +On an event of critical trip temperature crossing, the thermal framework +will trigger a hardware protection power-off (shutdown) or reboot, +depending on configuration. -On an event of critical trip temperature crossing the thermal framework -shuts down the system by calling hw_protection_shutdown(). The -hw_protection_shutdown() first attempts to perform an orderly shutdown -but accepts a delay after which it proceeds doing a forced power-off -or as last resort an emergency_restart. +At first, the kernel will attempt an orderly power-off or reboot, but +accepts a delay after which it proceeds to do a forced power-off or +reboot, respectively. If this fails, ``emergency_restart()`` is invoked +as last resort. The delay should be carefully profiled so as to give adequate time for -orderly poweroff. +orderly power-off or reboot. -If the delay is set to 0 emergency poweroff will not be supported. So a -carefully profiled non-zero positive value is a must for emergency -poweroff to be triggered. +If the delay is set to 0, the emergency action will not be supported. So a +carefully profiled non-zero positive value is a must for the emergency +action to be triggered. diff --git a/Documentation/driver-api/tty/tty_driver.rst b/Documentation/driver-api/tty/tty_driver.rst index cc529f863406..7138222a70f2 100644 --- a/Documentation/driver-api/tty/tty_driver.rst +++ b/Documentation/driver-api/tty/tty_driver.rst @@ -25,6 +25,8 @@ freed. For reference, both allocation and deallocation functions are explained here in detail: +.. kernel-doc:: include/linux/tty_driver.h + :identifiers: tty_alloc_driver .. kernel-doc:: drivers/tty/tty_io.c :identifiers: __tty_alloc_driver tty_driver_kref_put @@ -35,7 +37,7 @@ Here comes the documentation of flags accepted by tty_alloc_driver() (or __tty_alloc_driver()): .. kernel-doc:: include/linux/tty_driver.h - :doc: TTY Driver Flags + :identifiers: tty_driver_flag ---- diff --git a/Documentation/driver-api/tty/tty_struct.rst b/Documentation/driver-api/tty/tty_struct.rst index c72f5a4293b2..29caf1c1ca5f 100644 --- a/Documentation/driver-api/tty/tty_struct.rst +++ b/Documentation/driver-api/tty/tty_struct.rst @@ -72,7 +72,7 @@ TTY Struct Flags ================ .. kernel-doc:: include/linux/tty.h - :doc: TTY Struct Flags + :identifiers: tty_struct_flags TTY Struct Reference ==================== diff --git a/Documentation/driver-api/usb/usb.rst b/Documentation/driver-api/usb/usb.rst index 89f9c37bb979..976fb4221062 100644 --- a/Documentation/driver-api/usb/usb.rst +++ b/Documentation/driver-api/usb/usb.rst @@ -161,6 +161,7 @@ rely on 64bit DMA to eliminate another kind of bounce buffer. .. kernel-doc:: drivers/usb/core/urb.c :export: +.. c:namespace:: usb_core .. kernel-doc:: drivers/usb/core/message.c :export: diff --git a/Documentation/driver-api/usb/writing_musb_glue_layer.rst b/Documentation/driver-api/usb/writing_musb_glue_layer.rst index 10416cc11cd5..0bb96ecdf527 100644 --- a/Documentation/driver-api/usb/writing_musb_glue_layer.rst +++ b/Documentation/driver-api/usb/writing_musb_glue_layer.rst @@ -613,7 +613,7 @@ endpoints configuration from the hardware, so we use line 12 instruction to bypass reading the configuration from silicon, and rely on a hard-coded table that describes the endpoints configuration instead:: - static struct musb_fifo_cfg jz4740_musb_fifo_cfg[] = { + static const struct musb_fifo_cfg jz4740_musb_fifo_cfg[] = { { .hw_ep_num = 1, .style = FIFO_TX, .maxpacket = 512, }, { .hw_ep_num = 1, .style = FIFO_RX, .maxpacket = 512, }, { .hw_ep_num = 2, .style = FIFO_TX, .maxpacket = 64, }, @@ -717,4 +717,4 @@ https://www.maximintegrated.com/app-notes/index.mvp/id/1822 :ref:`Writing USB Device Drivers <writing-usb-driver>` Texas Instruments USB Configuration Wiki Page: -http://processors.wiki.ti.com/index.php/Usbgeneralpage +https://web.archive.org/web/20201215135015/http://processors.wiki.ti.com/index.php/Usbgeneralpage diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index 633d11c7fa71..2a21a42c9386 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -364,7 +364,7 @@ IOMMUFD IOAS/HWPT to enable userspace DMA:: MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); map.iova = 0; /* 1MB starting at 0x0 from device view */ map.length = 1024 * 1024; - map.ioas_id = alloc_data.out_ioas_id;; + map.ioas_id = alloc_data.out_ioas_id; ioctl(iommufd, IOMMU_IOAS_MAP, &map); diff --git a/Documentation/driver-api/wmi.rst b/Documentation/driver-api/wmi.rst index 6ca58c8249e5..4e8dbdb1fc67 100644 --- a/Documentation/driver-api/wmi.rst +++ b/Documentation/driver-api/wmi.rst @@ -7,12 +7,11 @@ WMI Driver API The WMI driver core supports a more modern bus-based interface for interacting with WMI devices, and an older GUID-based interface. The latter interface is considered to be deprecated, so new WMI drivers should generally avoid it since -it has some issues with multiple WMI devices and events sharing the same GUIDs -and/or notification IDs. The modern bus-based interface instead maps each -WMI device to a :c:type:`struct wmi_device <wmi_device>`, so it supports -WMI devices sharing GUIDs and/or notification IDs. Drivers can then register -a :c:type:`struct wmi_driver <wmi_driver>`, which will be bound to compatible -WMI devices by the driver core. +it has some issues with multiple WMI devices sharing the same GUID. +The modern bus-based interface instead maps each WMI device to a +:c:type:`struct wmi_device <wmi_device>`, so it supports WMI devices sharing the +same GUID. Drivers can then register a :c:type:`struct wmi_driver <wmi_driver>` +which will be bound to compatible WMI devices by the driver core. .. kernel-doc:: include/linux/wmi.h :internal: |